diff --git a/CHANGELOG_v16.md b/CHANGELOG_v16.md new file mode 100644 index 0000000..1c724df --- /dev/null +++ b/CHANGELOG_v16.md @@ -0,0 +1,58 @@ +# Changelog + +All notable changes to this project will be documented in this file. +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.6.0] - 2025-08-22 +### Added +- **Warp‑aggregated atomics** for distinguished point (DP) emission in GPU kernels. +- **Spanish README** updates mirroring English documentation. +- **Backward compatible reader** for legacy `.dat` v1.5 files. + +### Changed +- **New compact `.dat` format (v1.6 / `TMBM16`)**: DP record size reduced from 32B → **28B** (X tail 5B, distance 22B, type 1B). +- **Improved memory coalescing** when writing DPs, enabling more efficient PCIe transfers. +- **Documentation**: Updated `README.md` and `README_es.md` with a “What’s New in v1.6” section, benchmarks, and build tips. + +### Performance +- Throughput improvements of **+10–30%** (GPU/`-dp` dependent). Example: RTX 3060 `-dp 16` ~**+16%** over v1.5. + +### Compatibility +- v1.6 binaries **read** both `.dat` **v1.5** and **v1.6** formats. +- v1.6 **writes** `.dat` in the new v1.6 format by default. + +### Migration +- No action required for existing `.dat` v1.5 users; they continue to load. +- New runs will generate `.dat` v1.6 files (smaller on disk). + +--- + +## [1.5.0] - 2024-XX-XX +- Initial public release by RetiredC. +- GPU implementation of Pollard Kangaroo with DP infrastructure. + +--- + +# Registro de cambios (ES) + +## [1.6.0] - 2025-08-22 +### Añadido +- **Atómicas warp‑aggregadas** para la emisión de DPs en los kernels de GPU. +- **README en español** actualizado en paralelo al inglés. +- **Lectura retrocompatible** de archivos `.dat` v1.5. + +### Cambiado +- **Nuevo formato `.dat` (v1.6 / `TMBM16`)**: registro DP de 32B → **28B** (cola de X 5B, distancia 22B, tipo 1B). +- **Mejor coalescencia de memoria** al escribir DPs, con transferencias PCIe más eficientes. +- **Documentación**: `README.md` y `README_es.md` incluyen “Novedades v1.6”, benchmarks y flags de compilación. + +### Rendimiento +- Mejora de **+10–30%** (según GPU y `-dp`). Ejemplo: RTX 3060 `-dp 16` ~**+16%** vs v1.5. + +### Compatibilidad +- Los binarios v1.6 **leen** archivos `.dat` **v1.5** y **v1.6**. +- v1.6 **escribe** por defecto el nuevo formato `.dat` v1.6. + +### Migración +- No se requiere acción para usuarios con `.dat` v1.5; siguen cargando. +- Nuevas ejecuciones generarán `.dat` v1.6 (más pequeño en disco). diff --git a/Ec.cpp b/Ec.cpp index c598f39..84b4742 100644 --- a/Ec.cpp +++ b/Ec.cpp @@ -1,741 +1,829 @@ -// This file is a part of RCKangaroo software -// (c) 2024, RetiredCoder (RC) -// License: GPLv3, see "LICENSE.TXT" file -// https://github.com/RetiredC - - -#include "defs.h" -#include "Ec.h" -#include -#include "utils.h" - -// https://en.bitcoin.it/wiki/Secp256k1 -EcInt g_P; //FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F -EcPoint g_G; //Generator point - -#define P_REV 0x00000001000003D1 - -#ifdef DEBUG_MODE -u8* GTable = NULL; //16x16-bit table -#endif - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -bool parse_u8(const char* s, u8* res) -{ - char cl = toupper(s[1]); - char ch = toupper(s[0]); - if (((cl < '0') || (cl > '9')) && ((cl < 'A') || (cl > 'F'))) - return false; - if (((ch < '0') || (ch > '9')) && ((ch < 'A') || (ch > 'F'))) - return false; - u8 l = ((cl >= '0') && (cl <= '9')) ? (cl - '0') : (cl - 'A' + 10); - u8 h = ((ch >= '0') && (ch <= '9')) ? (ch - '0') : (ch - 'A' + 10); - *res = l + (h << 4); - return true; -} - -bool EcPoint::IsEqual(EcPoint& pnt) -{ - return this->x.IsEqual(pnt.x) && this->y.IsEqual(pnt.y); -} - -void EcPoint::LoadFromBuffer64(u8* buffer) -{ - memcpy(x.data, buffer, 32); - x.data[4] = 0; - memcpy(y.data, buffer + 32, 32); - y.data[4] = 0; -} - -void EcPoint::SaveToBuffer64(u8* buffer) -{ - memcpy(buffer, x.data, 32); - memcpy(buffer + 32, y.data, 32); -} - -bool EcPoint::SetHexStr(const char* str) -{ - EcPoint res; - int len = (int)strlen(str); - if (len < 66) - return false; - u8 type, b; - if (!parse_u8(str, &type)) - return false; - if ((type < 2) || (type > 4)) - return false; - if (((type == 2) || (type == 3)) && (len != 66)) - return false; - if ((type == 4) && (len != 130)) - return false; - - if (len == 66) //compressed - { - str += 2; - for (int i = 0; i < 32; i++) - { - if (!parse_u8(str + 2 * i, &b)) - return false; - ((u8*)res.x.data)[31 - i] = b; - } - res.y = Ec::CalcY(res.x, type == 2); - if (!Ec::IsValidPoint(res)) - return false; - *this = res; - return true; - } - //uncompressed - str += 2; - for (int i = 0; i < 32; i++) - { - if (!parse_u8(str + 2 * i, &b)) - return false; - ((u8*)res.x.data)[31 - i] = b; - - if (!parse_u8(str + 2 * i + 64, &b)) - return false; - ((u8*)res.y.data)[31 - i] = b; - } - if (!Ec::IsValidPoint(res)) - return false; - *this = res; - return true; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// https://en.bitcoin.it/wiki/Secp256k1 -void InitEc() -{ - g_P.SetHexStr("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"); //Fp - g_G.x.SetHexStr("79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798"); //G.x - g_G.y.SetHexStr("483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8"); //G.y -#ifdef DEBUG_MODE - GTable = (u8*)malloc(16 * 256 * 256 * 64); - EcPoint pnt = g_G; - for (int i = 0; i < 16; i++) - { - pnt.SaveToBuffer64(GTable + (i * 256 * 256) * 64); - EcPoint tmp = pnt; - pnt = Ec::DoublePoint(pnt); - for (int j = 1; j < 256 * 256 - 1; j++) - { - pnt.SaveToBuffer64(GTable + (i * 256 * 256 + j) * 64); - pnt = Ec::AddPoints(pnt, tmp); - } - } -#endif -}; - -void DeInitEc() -{ -#ifdef DEBUG_MODE - if (GTable) - free(GTable); -#endif -} - -// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_addition -EcPoint Ec::AddPoints(EcPoint& pnt1, EcPoint& pnt2) -{ - EcPoint res; - EcInt dx, dy, lambda, lambda2; - - dx = pnt2.x; - dx.SubModP(pnt1.x); - dx.InvModP(); - - dy = pnt2.y; - dy.SubModP(pnt1.y); - - lambda = dy; - lambda.MulModP(dx); - lambda2 = lambda; - lambda2.MulModP(lambda); - - res.x = lambda2; - res.x.SubModP(pnt1.x); - res.x.SubModP(pnt2.x); - - res.y = pnt2.x; - res.y.SubModP(res.x); - res.y.MulModP(lambda); - res.y.SubModP(pnt2.y); - return res; -} - -// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_doubling -EcPoint Ec::DoublePoint(EcPoint& pnt) -{ - EcPoint res; - EcInt t1, t2, lambda, lambda2; - - t1 = pnt.y; - t1.AddModP(pnt.y); - t1.InvModP(); - - t2 = pnt.x; - t2.MulModP(pnt.x); - lambda = t2; - lambda.AddModP(t2); - lambda.AddModP(t2); - lambda.MulModP(t1); - lambda2 = lambda; - lambda2.MulModP(lambda); - - res.x = lambda2; - res.x.SubModP(pnt.x); - res.x.SubModP(pnt.x); - - res.y = pnt.x; - res.y.SubModP(res.x); - res.y.MulModP(lambda); - res.y.SubModP(pnt.y); - return res; -} - -//k up to 256 bits -EcPoint Ec::MultiplyG(EcInt& k) -{ - EcPoint res; - EcPoint t = g_G; - bool first = true; - int n = 3; - while ((n >= 0) && !k.data[n]) - n--; - if (n < 0) - return res; //error - int index; - _BitScanReverse64((DWORD*)&index, k.data[n]); - for (int i = 0; i <= 64 * n + index; i++) - { - u8 v = (k.data[i / 64] >> (i % 64)) & 1; - if (v) - { - if (first) - { - first = false; - res = t; - } - else - res = Ec::AddPoints(res, t); - } - t = Ec::DoublePoint(t); - } - return res; -} - -#ifdef DEBUG_MODE -//uses gTable (16x16-bit) to speedup calculation -EcPoint Ec::MultiplyG_Fast(EcInt& k) -{ - int i; - u16 b; - EcPoint pnt, res; - for (i = 0; i < 16; i++) - { - b = k.GetU16(i); - if (b) - break; - } - if (i >= 16) - return pnt; - if (i < 16) - { - res.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64); - i++; - } - while (i < 16) - { - b = k.GetU16(i); - if (b) - { - pnt.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64); - res = AddPoints(res, pnt); - } - i++; - } - return res; -} -#endif - -EcInt Ec::CalcY(EcInt& x, bool is_even) -{ - EcInt res; - EcInt tmp; - tmp.Set(7); - res = x; - res.MulModP(x); - res.MulModP(x); - res.AddModP(tmp); - res.SqrtModP(); - if ((res.data[0] & 1) == is_even) - res.NegModP(); - return res; -} - -bool Ec::IsValidPoint(EcPoint& pnt) -{ - EcInt x, y, seven; - seven.Set(7); - x = pnt.x; - x.MulModP(pnt.x); - x.MulModP(pnt.x); - x.AddModP(seven); - y = pnt.y; - y.MulModP(pnt.y); - return x.IsEqual(y); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void Mul256_by_64(u64* input, u64 multiplier, u64* result) -{ - u64 h1, h2; - result[0] = _umul128(input[0], multiplier, &h1); - u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1); - carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2); - carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3); - _addcarry_u64(carry, 0, h2, result + 4); -} - -void Mul320_by_64(u64* input, u64 multiplier, u64* result) -{ - u64 h1, h2; - result[0] = _umul128(input[0], multiplier, &h1); - u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1); - carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2); - carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3); - _addcarry_u64(carry, _umul128(input[4], multiplier, &h1), h2, result + 4); -} - -void Add320_to_256(u64* in_out, u64* val) -{ - u8 c = _addcarry_u64(0, in_out[0], val[0], in_out); - c = _addcarry_u64(c, in_out[1], val[1], in_out + 1); - c = _addcarry_u64(c, in_out[2], val[2], in_out + 2); - c = _addcarry_u64(c, in_out[3], val[3], in_out + 3); - _addcarry_u64(c, 0, val[4], in_out + 4); -} - -EcInt::EcInt() -{ - SetZero(); -} - -void EcInt::Assign(EcInt& val) -{ - memcpy(data, val.data, sizeof(data)); -} - -void EcInt::Set(u64 val) -{ - SetZero(); - data[0] = val; -} - -void EcInt::SetZero() -{ - memset(data, 0, sizeof(data)); -} - -bool EcInt::SetHexStr(const char* str) -{ - SetZero(); - int len = (int)strlen(str); - if (len > 64) - return false; - char s[64]; - memset(s, '0', 64); - memcpy(s + 64 - len, str, len); - for (int i = 0; i < 32; i++) - { - int n = 62 - 2 * i; - u8 b; - if (!parse_u8(s + n, &b)) - return false; - ((u8*)data)[i] = b; - } - return true; -} - -void EcInt::GetHexStr(char* str) -{ - for (int i = 0; i < 32; i++) - sprintf(str + 2 * i, "%02X", ((u8*)data)[31 - i]); - str[64] = 0; -} - -u16 EcInt::GetU16(int index) -{ - return (u16)(data[index / 4] >> (16 * (index % 4))); -} - -//returns carry -bool EcInt::Add(EcInt& val) -{ - u8 c = _addcarry_u64(0, data[0], val.data[0], data + 0); - c = _addcarry_u64(c, data[1], val.data[1], data + 1); - c = _addcarry_u64(c, data[2], val.data[2], data + 2); - c = _addcarry_u64(c, data[3], val.data[3], data + 3); - return _addcarry_u64(c, data[4], val.data[4], data + 4) != 0; -} - -//returns carry -bool EcInt::Sub(EcInt& val) -{ - u8 c = _subborrow_u64(0, data[0], val.data[0], data + 0); - c = _subborrow_u64(c, data[1], val.data[1], data + 1); - c = _subborrow_u64(c, data[2], val.data[2], data + 2); - c = _subborrow_u64(c, data[3], val.data[3], data + 3); - return _subborrow_u64(c, data[4], val.data[4], data + 4) != 0; -} - -void EcInt::Neg() -{ - u8 c = _subborrow_u64(0, 0, data[0], data + 0); - c = _subborrow_u64(c, 0, data[1], data + 1); - c = _subborrow_u64(c, 0, data[2], data + 2); - c = _subborrow_u64(c, 0, data[3], data + 3); - _subborrow_u64(c, 0, data[4], data + 4); -} - -void EcInt::Neg256() -{ - u8 c = _subborrow_u64(0, 0, data[0], data + 0); - c = _subborrow_u64(c, 0, data[1], data + 1); - c = _subborrow_u64(c, 0, data[2], data + 2); - c = _subborrow_u64(c, 0, data[3], data + 3); - data[4] = 0; -} - -bool EcInt::IsLessThanU(EcInt& val) -{ - int i = 4; - while (i >= 0) - { - if (data[i] != val.data[i]) - break; - i--; - } - if (i < 0) - return false; - return data[i] < val.data[i]; -} - -bool EcInt::IsLessThanI(EcInt& val) -{ - if ((data[4] >> 63) && !(val.data[4] >> 63)) - return true; - if (!(data[4] >> 63) && (val.data[4] >> 63)) - return false; - - int i = 4; - while (i >= 0) - { - if (data[i] != val.data[i]) - break; - i--; - } - if (i < 0) - return false; - return data[i] < val.data[i]; -} - -bool EcInt::IsEqual(EcInt& val) -{ - return memcmp(val.data, this->data, 40) == 0; -} - -bool EcInt::IsZero() -{ - return ((data[0] == 0) && (data[1] == 0) && (data[2] == 0) && (data[3] == 0) && (data[4] == 0)); -} - -void EcInt::AddModP(EcInt& val) -{ - Add(val); - if (!IsLessThanU(g_P)) - Sub(g_P); -} - -void EcInt::SubModP(EcInt& val) -{ - if (Sub(val)) - Add(g_P); -} - -//assume value < P -void EcInt::NegModP() -{ - Neg(); - Add(g_P); -} - -void EcInt::ShiftRight(int nbits) -{ - int offset = nbits / 64; - if (offset) - { - for (int i = 0; i < 5 - offset; i++) - data[i] = data[i + offset]; - for (int i = 5 - offset; i < 5; i++) - data[i] = 0; - nbits -= 64 * offset; - } - data[0] = __shiftright128(data[0], data[1], nbits); - data[1] = __shiftright128(data[1], data[2], nbits); - data[2] = __shiftright128(data[2], data[3], nbits); - data[3] = __shiftright128(data[3], data[4], nbits); - data[4] = ((i64)data[4]) >> nbits; -} - -void EcInt::ShiftLeft(int nbits) -{ - int offset = nbits / 64; - if (offset) - { - for (int i = 4; i >= offset; i--) - data[i] = data[i - offset]; - for (int i = offset - 1; i >= 0; i--) - data[i] = 0; - nbits -= 64 * offset; - } - data[4] = __shiftleft128(data[3], data[4], nbits); - data[3] = __shiftleft128(data[2], data[3], nbits); - data[2] = __shiftleft128(data[1], data[2], nbits); - data[1] = __shiftleft128(data[0], data[1], nbits); - data[0] = data[0] << nbits; -} - -void EcInt::MulModP(EcInt& val) -{ - u64 buff[8], tmp[5], h; - //calc 512 bits - Mul256_by_64(val.data, data[0], buff); - Mul256_by_64(val.data, data[1], tmp); - Add320_to_256(buff + 1, tmp); - Mul256_by_64(val.data, data[2], tmp); - Add320_to_256(buff + 2, tmp); - Mul256_by_64(val.data, data[3], tmp); - Add320_to_256(buff + 3, tmp); - //fast mod P - Mul256_by_64(buff + 4, P_REV, tmp); - u8 c = _addcarry_u64(0, buff[0], tmp[0], buff); - c = _addcarry_u64(c, buff[1], tmp[1], buff + 1); - c = _addcarry_u64(c, buff[2], tmp[2], buff + 2); - tmp[4] += _addcarry_u64(c, buff[3], tmp[3], buff + 3); - c = _addcarry_u64(0, buff[0], _umul128(tmp[4], P_REV, &h), data); - c = _addcarry_u64(c, buff[1], h, data + 1); - c = _addcarry_u64(c, 0, buff[2], data + 2); - data[4] = _addcarry_u64(c, buff[3], 0, data + 3); - while (data[4]) - Sub(g_P); -} - -void EcInt::Mul_u64(EcInt& val, u64 multiplier) -{ - Assign(val); - Mul320_by_64(data, (u64)multiplier, data); -} - -void EcInt::Mul_i64(EcInt& val, i64 multiplier) -{ - Assign(val); - if (multiplier < 0) - { - Neg(); - multiplier = -multiplier; - } - Mul320_by_64(data, (u64)multiplier, data); -} - -#define APPLY_DIV_SHIFT() kbnt -= index; val >>= index; matrix[0] <<= index; matrix[1] <<= index; - -// https://tches.iacr.org/index.php/TCHES/article/download/8298/7648/4494 -//a bit tricky -void DIV_62(i64& kbnt, i64 modp, i64 val, i64* matrix) -{ - int index, cnt; - _BitScanForward64((DWORD*)&index, val | 0x4000000000000000); - APPLY_DIV_SHIFT(); - cnt = 62 - index; - while (cnt > 0) - { - if (kbnt < 0) - { - kbnt = -kbnt; - i64 tmp = -modp; modp = val; val = tmp; - tmp = -matrix[0]; matrix[0] = matrix[2]; matrix[2] = tmp; - tmp = -matrix[1]; matrix[1] = matrix[3]; matrix[3] = tmp; - } - int thr = cnt; - if ((kbnt + 1) < cnt) - thr = (int)(kbnt + 1); - i64 mul = (-modp * val) & ((UINT64_MAX >> (64 - thr)) & 0x07); - val += (modp * mul); - matrix[2] += (matrix[0] * mul); - matrix[3] += (matrix[1] * mul); - _BitScanForward64((DWORD*)&index, val | (1ull << cnt)); - APPLY_DIV_SHIFT(); - cnt -= index; - } -} - -void EcInt::InvModP() -{ - i64 matrix[4]; - EcInt result, a, tmp, tmp2; - EcInt modp, val; - i64 kbnt = -1; - matrix[1] = matrix[2] = 0; - matrix[0] = matrix[3] = 1; - DIV_62(kbnt, g_P.data[0], data[0], matrix); - modp.Mul_i64(g_P, matrix[0]); - tmp.Mul_i64(*this, matrix[1]); - modp.Add(tmp); - modp.ShiftRight(62); - val.Mul_i64(g_P, matrix[2]); - tmp.Mul_i64(*this, matrix[3]); - val.Add(tmp); - val.ShiftRight(62); - if (matrix[1] >= 0) - result.Set(matrix[1]); - else - { - result.Set(-matrix[1]); - result.Neg(); - } - if (matrix[3] >= 0) - a.Set(matrix[3]); - else - { - a.Set(-matrix[3]); - a.Neg(); - } - Mul320_by_64(g_P.data, (result.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data); - result.Add(tmp); - result.ShiftRight(62); - Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data); - a.Add(tmp); - a.ShiftRight(62); - - while (val.data[0] || val.data[1] || val.data[2] || val.data[3]) - { - matrix[1] = matrix[2] = 0; - matrix[0] = matrix[3] = 1; - DIV_62(kbnt, modp.data[0], val.data[0], matrix); - tmp.Mul_i64(modp, matrix[0]); - tmp2.Mul_i64(val, matrix[1]); - tmp.Add(tmp2); - tmp2.Mul_i64(val, matrix[3]); - val.Mul_i64(modp, matrix[2]); - val.Add(tmp2); - val.ShiftRight(62); - modp = tmp; - modp.ShiftRight(62); - tmp.Mul_i64(result, matrix[0]); - tmp2.Mul_i64(a, matrix[1]); - tmp.Add(tmp2); - tmp2.Mul_i64(a, matrix[3]); - a.Mul_i64(result, matrix[2]); - a.Add(tmp2); - Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data); - a.Add(tmp2); - a.ShiftRight(62); - Mul320_by_64(g_P.data, (tmp.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data); - result = tmp; - result.Add(tmp2); - result.ShiftRight(62); - } - Assign(result); - if (modp.data[4] >> 63) - { - Neg(); - modp.Neg(); - } - - if (modp.data[0] == 1) - { - if (data[4] >> 63) - Add(g_P); - if (data[4] >> 63) - Add(g_P); - if (!IsLessThanU(g_P)) - Sub(g_P); - if (!IsLessThanU(g_P)) - Sub(g_P); - } - else - SetZero(); //error -} - -// x = a^ { (p + 1) / 4 } mod p -void EcInt::SqrtModP() -{ - EcInt one, res; - one.Set(1); - EcInt exp = g_P; - exp.Add(one); - exp.ShiftRight(2); - res.Set(1); - EcInt cur = *this; - while (!exp.IsZero()) - { - if (exp.data[0] & 1) - res.MulModP(cur); - EcInt tmp = cur; - tmp.MulModP(cur); - cur = tmp; - exp.ShiftRight(1); - } - *this = res; -} - -std::mt19937_64 rng; -CriticalSection cs_rnd; - -void SetRndSeed(u64 seed) -{ - rng.seed(seed); -} - -void EcInt::RndBits(int nbits) -{ - SetZero(); - if (nbits > 256) - nbits = 256; - cs_rnd.Enter(); - for (int i = 0; i < (nbits + 63) / 64; i++) - data[i] = rng(); - cs_rnd.Leave(); - data[nbits / 64] &= (1ull << (nbits % 64)) - 1; -} - -//up to 256 bits only -void EcInt::RndMax(EcInt& max) -{ - SetZero(); - int n = 3; - while ((n >= 0) && !max.data[n]) - n--; - if (n < 0) - return; - u64 val = max.data[n]; - int k = 0; - while ((val & 0x8000000000000000) == 0) - { - val <<= 1; - k++; - } - int bits = 64 * n + (64 - k); - RndBits(bits); - while (!IsLessThanU(max)) // :) - RndBits(bits); -} - - - - - +// This file is a part of RCKangaroo software +// (c) 2024, RetiredCoder (RC) +// License: GPLv3, see "LICENSE.TXT" file +// https://github.com/RetiredC + + +#include "defs.h" +#include "Ec.h" +#include +#include "utils.h" + +// https://en.bitcoin.it/wiki/Secp256k1 +EcInt g_P; //FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F +EcInt g_N; //FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE BAAEDCE6 AF48A03B BFD25E8C D0364141 +EcPoint g_G; //Generator point + +#define P_REV 0x00000001000003D1 + +#ifdef DEBUG_MODE +u8* GTable = NULL; //16x16-bit table +#endif + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +bool parse_u8(const char* s, u8* res) +{ + char cl = toupper(s[1]); + char ch = toupper(s[0]); + if (((cl < '0') || (cl > '9')) && ((cl < 'A') || (cl > 'F'))) + return false; + if (((ch < '0') || (ch > '9')) && ((ch < 'A') || (ch > 'F'))) + return false; + u8 l = ((cl >= '0') && (cl <= '9')) ? (cl - '0') : (cl - 'A' + 10); + u8 h = ((ch >= '0') && (ch <= '9')) ? (ch - '0') : (ch - 'A' + 10); + *res = l + (h << 4); + return true; +} + +bool EcPoint::IsEqual(EcPoint& pnt) +{ + return this->x.IsEqual(pnt.x) && this->y.IsEqual(pnt.y); +} + +void EcPoint::LoadFromBuffer64(u8* buffer) +{ + memcpy(x.data, buffer, 32); + x.data[4] = 0; + memcpy(y.data, buffer + 32, 32); + y.data[4] = 0; +} + +void EcPoint::SaveToBuffer64(u8* buffer) +{ + memcpy(buffer, x.data, 32); + memcpy(buffer + 32, y.data, 32); +} + +bool EcPoint::SetHexStr(const char* str) +{ + EcPoint res; + int len = (int)strlen(str); + if (len < 66) + return false; + u8 type, b; + if (!parse_u8(str, &type)) + return false; + if ((type < 2) || (type > 4)) + return false; + if (((type == 2) || (type == 3)) && (len != 66)) + return false; + if ((type == 4) && (len != 130)) + return false; + + if (len == 66) //compressed + { + str += 2; + for (int i = 0; i < 32; i++) + { + if (!parse_u8(str + 2 * i, &b)) + return false; + ((u8*)res.x.data)[31 - i] = b; + } + res.y = Ec::CalcY(res.x, type == 2); + if (!Ec::IsValidPoint(res)) + return false; + *this = res; + return true; + } + //uncompressed + str += 2; + for (int i = 0; i < 32; i++) + { + if (!parse_u8(str + 2 * i, &b)) + return false; + ((u8*)res.x.data)[31 - i] = b; + + if (!parse_u8(str + 2 * i + 64, &b)) + return false; + ((u8*)res.y.data)[31 - i] = b; + } + if (!Ec::IsValidPoint(res)) + return false; + *this = res; + return true; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// https://en.bitcoin.it/wiki/Secp256k1 +void InitEc() +{ + g_P.SetHexStr("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"); //Fp + g_G.x.SetHexStr("79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798"); //G.x + g_G.y.SetHexStr("483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8"); //G.y + g_N.SetHexStr("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141"); //order of G +#ifdef DEBUG_MODE + GTable = (u8*)malloc(16 * 256 * 256 * 64); + EcPoint pnt = g_G; + for (int i = 0; i < 16; i++) + { + pnt.SaveToBuffer64(GTable + (i * 256 * 256) * 64); + EcPoint tmp = pnt; + pnt = Ec::DoublePoint(pnt); + for (int j = 1; j < 256 * 256 - 1; j++) + { + pnt.SaveToBuffer64(GTable + (i * 256 * 256 + j) * 64); + pnt = Ec::AddPoints(pnt, tmp); + } + } +#endif +}; + +void DeInitEc() +{ +#ifdef DEBUG_MODE + if (GTable) + free(GTable); +#endif +} + +// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_addition +EcPoint Ec::AddPoints(EcPoint& pnt1, EcPoint& pnt2) +{ + EcPoint res; + EcInt dx, dy, lambda, lambda2; + + dx = pnt2.x; + dx.SubModP(pnt1.x); + dx.InvModP(); + + dy = pnt2.y; + dy.SubModP(pnt1.y); + + lambda = dy; + lambda.MulModP(dx); + lambda2 = lambda; + lambda2.MulModP(lambda); + + res.x = lambda2; + res.x.SubModP(pnt1.x); + res.x.SubModP(pnt2.x); + + res.y = pnt2.x; + res.y.SubModP(res.x); + res.y.MulModP(lambda); + res.y.SubModP(pnt2.y); + return res; +} + +// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_doubling +EcPoint Ec::DoublePoint(EcPoint& pnt) +{ + EcPoint res; + EcInt t1, t2, lambda, lambda2; + + t1 = pnt.y; + t1.AddModP(pnt.y); + t1.InvModP(); + + t2 = pnt.x; + t2.MulModP(pnt.x); + lambda = t2; + lambda.AddModP(t2); + lambda.AddModP(t2); + lambda.MulModP(t1); + lambda2 = lambda; + lambda2.MulModP(lambda); + + res.x = lambda2; + res.x.SubModP(pnt.x); + res.x.SubModP(pnt.x); + + res.y = pnt.x; + res.y.SubModP(res.x); + res.y.MulModP(lambda); + res.y.SubModP(pnt.y); + return res; +} + +//k up to 256 bits +EcPoint Ec::MultiplyG(EcInt& k) +{ + EcPoint res; + EcPoint t = g_G; + bool first = true; + int n = 3; + while ((n >= 0) && !k.data[n]) + n--; + if (n < 0) + return res; //error + int index; + _BitScanReverse64((DWORD*)&index, k.data[n]); + for (int i = 0; i <= 64 * n + index; i++) + { + u8 v = (k.data[i / 64] >> (i % 64)) & 1; + if (v) + { + if (first) + { + first = false; + res = t; + } + else + res = Ec::AddPoints(res, t); + } + t = Ec::DoublePoint(t); + } + return res; +} + +#ifdef DEBUG_MODE +//uses gTable (16x16-bit) to speedup calculation +EcPoint Ec::MultiplyG_Fast(EcInt& k) +{ + int i; + u16 b; + EcPoint pnt, res; + for (i = 0; i < 16; i++) + { + b = k.GetU16(i); + if (b) + break; + } + if (i >= 16) + return pnt; + if (i < 16) + { + res.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64); + i++; + } + while (i < 16) + { + b = k.GetU16(i); + if (b) + { + pnt.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64); + res = AddPoints(res, pnt); + } + i++; + } + return res; +} +#endif + +EcInt Ec::CalcY(EcInt& x, bool is_even) +{ + EcInt res; + EcInt tmp; + tmp.Set(7); + res = x; + res.MulModP(x); + res.MulModP(x); + res.AddModP(tmp); + res.SqrtModP(); + if ((res.data[0] & 1) == is_even) + res.NegModP(); + return res; +} + +bool Ec::IsValidPoint(EcPoint& pnt) +{ + EcInt x, y, seven; + seven.Set(7); + x = pnt.x; + x.MulModP(pnt.x); + x.MulModP(pnt.x); + x.AddModP(seven); + y = pnt.y; + y.MulModP(pnt.y); + return x.IsEqual(y); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void Mul256_by_64(u64* input, u64 multiplier, u64* result) +{ + u64 h1, h2; + result[0] = _umul128(input[0], multiplier, &h1); + u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1); + carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2); + carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3); + _addcarry_u64(carry, 0, h2, result + 4); +} + +void Mul320_by_64(u64* input, u64 multiplier, u64* result) +{ + u64 h1, h2; + result[0] = _umul128(input[0], multiplier, &h1); + u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1); + carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2); + carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3); + _addcarry_u64(carry, _umul128(input[4], multiplier, &h1), h2, result + 4); +} + +void Add320_to_256(u64* in_out, u64* val) +{ + u8 c = _addcarry_u64(0, in_out[0], val[0], in_out); + c = _addcarry_u64(c, in_out[1], val[1], in_out + 1); + c = _addcarry_u64(c, in_out[2], val[2], in_out + 2); + c = _addcarry_u64(c, in_out[3], val[3], in_out + 3); + _addcarry_u64(c, 0, val[4], in_out + 4); +} + +EcInt::EcInt() +{ + SetZero(); +} + +void EcInt::Assign(EcInt& val) +{ + memcpy(data, val.data, sizeof(data)); +} + +void EcInt::Set(u64 val) +{ + SetZero(); + data[0] = val; +} + +void EcInt::SetZero() +{ + memset(data, 0, sizeof(data)); +} + +bool EcInt::SetHexStr(const char* str) +{ + SetZero(); + int len = (int)strlen(str); + if (len > 64) + return false; + char s[64]; + memset(s, '0', 64); + memcpy(s + 64 - len, str, len); + for (int i = 0; i < 32; i++) + { + int n = 62 - 2 * i; + u8 b; + if (!parse_u8(s + n, &b)) + return false; + ((u8*)data)[i] = b; + } + return true; +} + +void EcInt::GetHexStr(char* str) +{ + for (int i = 0; i < 32; i++) + sprintf(str + 2 * i, "%02X", ((u8*)data)[31 - i]); + str[64] = 0; +} + +u16 EcInt::GetU16(int index) +{ + return (u16)(data[index / 4] >> (16 * (index % 4))); +} + +//returns carry +bool EcInt::Add(EcInt& val) +{ + u8 c = _addcarry_u64(0, data[0], val.data[0], data + 0); + c = _addcarry_u64(c, data[1], val.data[1], data + 1); + c = _addcarry_u64(c, data[2], val.data[2], data + 2); + c = _addcarry_u64(c, data[3], val.data[3], data + 3); + return _addcarry_u64(c, data[4], val.data[4], data + 4) != 0; +} + +//returns carry +bool EcInt::Sub(EcInt& val) +{ + u8 c = _subborrow_u64(0, data[0], val.data[0], data + 0); + c = _subborrow_u64(c, data[1], val.data[1], data + 1); + c = _subborrow_u64(c, data[2], val.data[2], data + 2); + c = _subborrow_u64(c, data[3], val.data[3], data + 3); + return _subborrow_u64(c, data[4], val.data[4], data + 4) != 0; +} + +void EcInt::Neg() +{ + u8 c = _subborrow_u64(0, 0, data[0], data + 0); + c = _subborrow_u64(c, 0, data[1], data + 1); + c = _subborrow_u64(c, 0, data[2], data + 2); + c = _subborrow_u64(c, 0, data[3], data + 3); + _subborrow_u64(c, 0, data[4], data + 4); +} + +void EcInt::Neg256() +{ + u8 c = _subborrow_u64(0, 0, data[0], data + 0); + c = _subborrow_u64(c, 0, data[1], data + 1); + c = _subborrow_u64(c, 0, data[2], data + 2); + c = _subborrow_u64(c, 0, data[3], data + 3); + data[4] = 0; +} + +bool EcInt::IsLessThanU(EcInt& val) +{ + int i = 4; + while (i >= 0) + { + if (data[i] != val.data[i]) + break; + i--; + } + if (i < 0) + return false; + return data[i] < val.data[i]; +} + +bool EcInt::IsLessThanI(EcInt& val) +{ + if ((data[4] >> 63) && !(val.data[4] >> 63)) + return true; + if (!(data[4] >> 63) && (val.data[4] >> 63)) + return false; + + int i = 4; + while (i >= 0) + { + if (data[i] != val.data[i]) + break; + i--; + } + if (i < 0) + return false; + return data[i] < val.data[i]; +} + +bool EcInt::IsEqual(EcInt& val) +{ + return memcmp(val.data, this->data, 40) == 0; +} + +bool EcInt::IsZero() +{ + return ((data[0] == 0) && (data[1] == 0) && (data[2] == 0) && (data[3] == 0) && (data[4] == 0)); +} + +void EcInt::AddModP(EcInt& val) +{ + Add(val); + if (!IsLessThanU(g_P)) + Sub(g_P); +} + +void EcInt::SubModP(EcInt& val) +{ + if (Sub(val)) + Add(g_P); +} + +//assume value < P +void EcInt::NegModP() +{ + Neg(); + Add(g_P); +} + +//assume value < N +void EcInt::NegModN() +{ + Neg(); + Add(g_N); +} + +void EcInt::ShiftRight(int nbits) +{ + int offset = nbits / 64; + if (offset) + { + for (int i = 0; i < 5 - offset; i++) + data[i] = data[i + offset]; + for (int i = 5 - offset; i < 5; i++) + data[i] = 0; + nbits -= 64 * offset; + } + data[0] = __shiftright128(data[0], data[1], nbits); + data[1] = __shiftright128(data[1], data[2], nbits); + data[2] = __shiftright128(data[2], data[3], nbits); + data[3] = __shiftright128(data[3], data[4], nbits); + data[4] = ((i64)data[4]) >> nbits; +} + +void EcInt::ShiftLeft(int nbits) +{ + int offset = nbits / 64; + if (offset) + { + for (int i = 4; i >= offset; i--) + data[i] = data[i - offset]; + for (int i = offset - 1; i >= 0; i--) + data[i] = 0; + nbits -= 64 * offset; + } + data[4] = __shiftleft128(data[3], data[4], nbits); + data[3] = __shiftleft128(data[2], data[3], nbits); + data[2] = __shiftleft128(data[1], data[2], nbits); + data[1] = __shiftleft128(data[0], data[1], nbits); + data[0] = data[0] << nbits; +} + +void EcInt::MulModP(EcInt& val) +{ + u64 buff[8], tmp[5], h; + //calc 512 bits + Mul256_by_64(val.data, data[0], buff); + Mul256_by_64(val.data, data[1], tmp); + Add320_to_256(buff + 1, tmp); + Mul256_by_64(val.data, data[2], tmp); + Add320_to_256(buff + 2, tmp); + Mul256_by_64(val.data, data[3], tmp); + Add320_to_256(buff + 3, tmp); + //fast mod P + Mul256_by_64(buff + 4, P_REV, tmp); + u8 c = _addcarry_u64(0, buff[0], tmp[0], buff); + c = _addcarry_u64(c, buff[1], tmp[1], buff + 1); + c = _addcarry_u64(c, buff[2], tmp[2], buff + 2); + tmp[4] += _addcarry_u64(c, buff[3], tmp[3], buff + 3); + c = _addcarry_u64(0, buff[0], _umul128(tmp[4], P_REV, &h), data); + c = _addcarry_u64(c, buff[1], h, data + 1); + c = _addcarry_u64(c, 0, buff[2], data + 2); + data[4] = _addcarry_u64(c, buff[3], 0, data + 3); +} + +void EcInt::Mul_u64(EcInt& val, u64 multiplier) +{ + Assign(val); + Mul320_by_64(data, (u64)multiplier, data); +} + +void EcInt::Mul_i64(EcInt& val, i64 multiplier) +{ + Assign(val); + if (multiplier < 0) + { + Neg(); + multiplier = -multiplier; + } + Mul320_by_64(data, (u64)multiplier, data); +} + +#define APPLY_DIV_SHIFT() kbnt -= index; val >>= index; matrix[0] <<= index; matrix[1] <<= index; + +// https://tches.iacr.org/index.php/TCHES/article/download/8298/7648/4494 +//a bit tricky +void DIV_62(i64& kbnt, i64 modp, i64 val, i64* matrix) +{ + int index, cnt; + _BitScanForward64((DWORD*)&index, val | 0x4000000000000000); + APPLY_DIV_SHIFT(); + cnt = 62 - index; + while (cnt > 0) + { + if (kbnt < 0) + { + kbnt = -kbnt; + i64 tmp = -modp; modp = val; val = tmp; + tmp = -matrix[0]; matrix[0] = matrix[2]; matrix[2] = tmp; + tmp = -matrix[1]; matrix[1] = matrix[3]; matrix[3] = tmp; + } + int thr = cnt; + if ((kbnt + 1) < cnt) + thr = (int)(kbnt + 1); + i64 mul = (-modp * val) & ((UINT64_MAX >> (64 - thr)) & 0x07); + val += (modp * mul); + matrix[2] += (matrix[0] * mul); + matrix[3] += (matrix[1] * mul); + _BitScanForward64((DWORD*)&index, val | (1ull << cnt)); + APPLY_DIV_SHIFT(); + cnt -= index; + } +} + +void EcInt::InvModP() +{ + i64 matrix[4]; + EcInt result, a, tmp, tmp2; + EcInt modp, val; + i64 kbnt = -1; + matrix[1] = matrix[2] = 0; + matrix[0] = matrix[3] = 1; + DIV_62(kbnt, g_P.data[0], data[0], matrix); + modp.Mul_i64(g_P, matrix[0]); + tmp.Mul_i64(*this, matrix[1]); + modp.Add(tmp); + modp.ShiftRight(62); + val.Mul_i64(g_P, matrix[2]); + tmp.Mul_i64(*this, matrix[3]); + val.Add(tmp); + val.ShiftRight(62); + if (matrix[1] >= 0) + result.Set(matrix[1]); + else + { + result.Set(-matrix[1]); + result.Neg(); + } + if (matrix[3] >= 0) + a.Set(matrix[3]); + else + { + a.Set(-matrix[3]); + a.Neg(); + } + Mul320_by_64(g_P.data, (result.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data); + result.Add(tmp); + result.ShiftRight(62); + Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data); + a.Add(tmp); + a.ShiftRight(62); + + while (val.data[0] || val.data[1] || val.data[2] || val.data[3]) + { + matrix[1] = matrix[2] = 0; + matrix[0] = matrix[3] = 1; + DIV_62(kbnt, modp.data[0], val.data[0], matrix); + tmp.Mul_i64(modp, matrix[0]); + tmp2.Mul_i64(val, matrix[1]); + tmp.Add(tmp2); + tmp2.Mul_i64(val, matrix[3]); + val.Mul_i64(modp, matrix[2]); + val.Add(tmp2); + val.ShiftRight(62); + modp = tmp; + modp.ShiftRight(62); + tmp.Mul_i64(result, matrix[0]); + tmp2.Mul_i64(a, matrix[1]); + tmp.Add(tmp2); + tmp2.Mul_i64(a, matrix[3]); + a.Mul_i64(result, matrix[2]); + a.Add(tmp2); + Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data); + a.Add(tmp2); + a.ShiftRight(62); + Mul320_by_64(g_P.data, (tmp.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data); + result = tmp; + result.Add(tmp2); + result.ShiftRight(62); + } + Assign(result); + if (modp.data[4] >> 63) + { + Neg(); + modp.Neg(); + } + + if (modp.data[0] == 1) + { + if (data[4] >> 63) + Add(g_P); + if (data[4] >> 63) + Add(g_P); + if (!IsLessThanU(g_P)) + Sub(g_P); + if (!IsLessThanU(g_P)) + Sub(g_P); + } + else + SetZero(); //error +} + +// x = a^ { (p + 1) / 4 } mod p +void EcInt::SqrtModP() +{ + EcInt one, res; + one.Set(1); + EcInt exp = g_P; + exp.Add(one); + exp.ShiftRight(2); + res.Set(1); + EcInt cur = *this; + while (!exp.IsZero()) + { + if (exp.data[0] & 1) + res.MulModP(cur); + EcInt tmp = cur; + tmp.MulModP(cur); + cur = tmp; + exp.ShiftRight(1); + } + *this = res; +} + +std::mt19937_64 rng; +CriticalSection cs_rnd; + +void SetRndSeed(u64 seed) +{ + rng.seed(seed); +} + +void EcInt::RndBits(int nbits) +{ + SetZero(); + if (nbits > 256) + nbits = 256; + cs_rnd.Enter(); + for (int i = 0; i < (nbits + 63) / 64; i++) + data[i] = rng(); + cs_rnd.Leave(); + data[nbits / 64] &= (1ull << (nbits % 64)) - 1; +} + +//up to 256 bits only +void EcInt::RndMax(EcInt& max) +{ + SetZero(); + int n = 3; + while ((n >= 0) && !max.data[n]) + n--; + if (n < 0) + return; + u64 val = max.data[n]; + int k = 0; + while ((val & 0x8000000000000000) == 0) + { + val <<= 1; + k++; + } + int bits = 64 * n + (64 - k); + RndBits(bits); + while (!IsLessThanU(max)) // :) + RndBits(bits); +} + + +// ============================================================================ +// Extensión: w-NAF (w=4) para multiplicación escalar de G en CPU +// - Implementa una versión canónica de w-NAF usando las primitivas afines +// ya existentes (AddPoints / DoublePoint). No toca el resto del proyecto. +// ============================================================================ + +namespace { + static inline bool limb_is_zero(const u64 a[4]) { + return (a[0]|a[1]|a[2]|a[3])==0ull; + } + static inline bool limb_is_odd(const u64 a[4]) { + return (a[0] & 1ull) != 0ull; + } + static inline void limb_sub_small(u64 a[4], unsigned v) { + unsigned long long b = v; + unsigned __int128 t = (unsigned __int128)a[0] - b; + a[0] = (u64)t; + unsigned __int128 borrow = (t >> 127) & 1; + for (int i=1;i<4 && borrow;i++) { + unsigned __int128 t2 = (unsigned __int128)a[i] - 1; + a[i] = (u64)t2; + borrow = (t2 >> 127) & 1; + } + } + static inline void limb_add_small(u64 a[4], unsigned v) { + unsigned __int128 t = (unsigned __int128)a[0] + (unsigned)v; + a[0] = (u64)t; + u64 carry = (u64)(t >> 64); + for (int i=1;i<4 && carry;i++) { + unsigned __int128 t2 = (unsigned __int128)a[i] + 1; + a[i] = (u64)t2; + carry = (u64)(t2 >> 64); + } + } + static inline void limb_shr1(u64 a[4]) { + u64 c = 0; + for (int i=3;i>=0;i--) { + u64 n = a[i]; + a[i] = (n >> 1) | (c << 63); + c = n & 1ull; + } + } +} // anonymous + +EcPoint EcEx::MultiplyG_WNAF4(const EcInt &k) { + // Precompute odd multiples of G: [1,3,5,7,9,11,13,15]G + EcPoint pre[8]; + pre[0] = g_G; + EcPoint twoG = Ec::DoublePoint(g_G); + for (int i=1;i<8;i++) pre[i] = Ec::AddPoints(pre[i-1], twoG); + + // Convert k to a 256-bit limb array + u64 kk[4] = {k.data[0], k.data[1], k.data[2], k.data[3]}; + + // w-NAF digit expansion + int digits[300]; int nd=0; + while (!limb_is_zero(kk)) { + int ui = 0; + if (limb_is_odd(kk)) { + int mod16 = (int)(kk[0] & 15ull); + if (mod16 > 8) ui = mod16 - 16; else ui = mod16; + if (ui >= 0) limb_sub_small(kk, (unsigned)ui); + else limb_add_small(kk, (unsigned)(-ui)); + } + digits[nd++] = ui; + limb_shr1(kk); + } + + // Evaluate + bool R_is_inf = true; + EcPoint R; // undefined if inf + for (int i=nd-1;i>=0;i--) { + if (!R_is_inf) R = Ec::DoublePoint(R); + int ui = digits[i]; + if (ui != 0) { + EcPoint add = pre[(abs(ui)-1)>>1]; + if (ui < 0) { add.y.NegModP(); } + if (R_is_inf) { R = add; R_is_inf = false; } + else { R = Ec::AddPoints(R, add); } + } + } + if (R_is_inf) return EcPoint(); // zero + return R; +} diff --git a/Ec.h b/Ec.h index fa123e3..3018dd7 100644 --- a/Ec.h +++ b/Ec.h @@ -1,77 +1,80 @@ -// This file is a part of RCKangaroo software -// (c) 2024, RetiredCoder (RC) -// License: GPLv3, see "LICENSE.TXT" file -// https://github.com/RetiredC/Kang-2 - - -#pragma once - -#include "defs.h" -#include "utils.h" - -class EcInt -{ -public: - EcInt(); - - void Assign(EcInt& val); - void Set(u64 val); - void SetZero(); - bool SetHexStr(const char* str); - void GetHexStr(char* str); - u16 GetU16(int index); - - bool Add(EcInt& val); //returns true if carry - bool Sub(EcInt& val); //returns true if carry - void Neg(); - void Neg256(); - void ShiftRight(int nbits); - void ShiftLeft(int nbits); - bool IsLessThanU(EcInt& val); - bool IsLessThanI(EcInt& val); - bool IsEqual(EcInt& val); - bool IsZero(); - - void Mul_u64(EcInt& val, u64 multiplier); - void Mul_i64(EcInt& val, i64 multiplier); - - void AddModP(EcInt& val); - void SubModP(EcInt& val); - void NegModP(); - void MulModP(EcInt& val); - void InvModP(); - void SqrtModP(); - - void RndBits(int nbits); - void RndMax(EcInt& max); - - u64 data[4 + 1]; -}; - -class EcPoint -{ -public: - bool IsEqual(EcPoint& pnt); - void LoadFromBuffer64(u8* buffer); - void SaveToBuffer64(u8* buffer); - bool SetHexStr(const char* str); - EcInt x; - EcInt y; -}; - -class Ec -{ -public: - static EcPoint AddPoints(EcPoint& pnt1, EcPoint& pnt2); - static EcPoint DoublePoint(EcPoint& pnt); - static EcPoint MultiplyG(EcInt& k); -#ifdef DEBUG_MODE - static EcPoint MultiplyG_Fast(EcInt& k); -#endif - static EcInt CalcY(EcInt& x, bool is_even); - static bool IsValidPoint(EcPoint& pnt); -}; - -void InitEc(); -void DeInitEc(); -void SetRndSeed(u64 seed); \ No newline at end of file +// This file is a part of RCKangaroo software +// (c) 2024, RetiredCoder (RC) +// License: GPLv3, see "LICENSE.TXT" file +// https://github.com/RetiredC/Kang-2 + + +#pragma once + +#include "defs.h" +#include "utils.h" + +class EcInt +{ +public: + EcInt(); + + void Assign(EcInt& val); + void Set(u64 val); + void SetZero(); + bool SetHexStr(const char* str); + void GetHexStr(char* str); + u16 GetU16(int index); + + bool Add(EcInt& val); //returns true if carry + bool Sub(EcInt& val); //returns true if carry + void Neg(); + void Neg256(); + void ShiftRight(int nbits); + void ShiftLeft(int nbits); + bool IsLessThanU(EcInt& val); + bool IsLessThanI(EcInt& val); + bool IsEqual(EcInt& val); + bool IsZero(); + + void Mul_u64(EcInt& val, u64 multiplier); + void Mul_i64(EcInt& val, i64 multiplier); + + void AddModP(EcInt& val); + void SubModP(EcInt& val); + void NegModP(); + void NegModN(); + void MulModP(EcInt& val); + void InvModP(); + void SqrtModP(); + + void RndBits(int nbits); + void RndMax(EcInt& max); + + u64 data[4 + 1]; +}; + +class EcPoint +{ +public: + bool IsEqual(EcPoint& pnt); + void LoadFromBuffer64(u8* buffer); + void SaveToBuffer64(u8* buffer); + bool SetHexStr(const char* str); + EcInt x; + EcInt y; +}; + +class Ec +{ +public: + static EcPoint AddPoints(EcPoint& pnt1, EcPoint& pnt2); + static EcPoint DoublePoint(EcPoint& pnt); + static EcPoint MultiplyG(EcInt& k); +#ifdef DEBUG_MODE + static EcPoint MultiplyG_Fast(EcInt& k); +#endif + static EcInt CalcY(EcInt& x, bool is_even); + static bool IsValidPoint(EcPoint& pnt); +}; + +void InitEc(); +void DeInitEc(); +void SetRndSeed(u64 seed); + +class EcEx { public: static EcPoint MultiplyG_WNAF4(const EcInt &k); }; diff --git a/GpuKang.cpp b/GpuKang.cpp index f792d8c..d9b1bcd 100644 --- a/GpuKang.cpp +++ b/GpuKang.cpp @@ -1,517 +1,522 @@ -// This file is a part of RCKangaroo software -// (c) 2024, RetiredCoder (RC) -// License: GPLv3, see "LICENSE.TXT" file -// https://github.com/RetiredC - - -#include -#include "cuda_runtime.h" -#include "cuda.h" - -#include "GpuKang.h" - -cudaError_t cuSetGpuParams(TKparams Kparams, u64* _jmp2_table); -void CallGpuKernelGen(TKparams Kparams); -void CallGpuKernelABC(TKparams Kparams); -void AddPointsToList(u32* data, int cnt, u64 ops_cnt); -extern bool gGenMode; //tames generation mode - -int RCGpuKang::CalcKangCnt() -{ - Kparams.BlockCnt = mpCnt; - Kparams.BlockSize = IsOldGpu ? 512 : 256; - Kparams.GroupCnt = IsOldGpu ? 64 : 24; - return Kparams.BlockSize* Kparams.GroupCnt* Kparams.BlockCnt; -} - -//executes in main thread -bool RCGpuKang::Prepare(EcPoint _PntToSolve, int _Range, int _DP, EcJMP* _EcJumps1, EcJMP* _EcJumps2, EcJMP* _EcJumps3) -{ - PntToSolve = _PntToSolve; - Range = _Range; - DP = _DP; - EcJumps1 = _EcJumps1; - EcJumps2 = _EcJumps2; - EcJumps3 = _EcJumps3; - StopFlag = false; - Failed = false; - u64 total_mem = 0; - memset(dbg, 0, sizeof(dbg)); - memset(SpeedStats, 0, sizeof(SpeedStats)); - cur_stats_ind = 0; - - cudaError_t err; - err = cudaSetDevice(CudaIndex); - if (err != cudaSuccess) - return false; - - Kparams.BlockCnt = mpCnt; - Kparams.BlockSize = IsOldGpu ? 512 : 256; - Kparams.GroupCnt = IsOldGpu ? 64 : 24; - KangCnt = Kparams.BlockSize * Kparams.GroupCnt * Kparams.BlockCnt; - Kparams.KangCnt = KangCnt; - Kparams.DP = DP; - Kparams.KernelA_LDS_Size = 64 * JMP_CNT + 16 * Kparams.BlockSize; - Kparams.KernelB_LDS_Size = 64 * JMP_CNT; - Kparams.KernelC_LDS_Size = 96 * JMP_CNT; - Kparams.IsGenMode = gGenMode; - -//allocate gpu mem - u64 size; - if (!IsOldGpu) - { - //L2 - int L2size = Kparams.KangCnt * (3 * 32); - total_mem += L2size; - err = cudaMalloc((void**)&Kparams.L2, L2size); - if (err != cudaSuccess) - { - printf("GPU %d, Allocate L2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - size = L2size; - if (size > persistingL2CacheMaxSize) - size = persistingL2CacheMaxSize; - err = cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, size); // set max allowed size for L2 - //persisting for L2 - cudaStreamAttrValue stream_attribute; - stream_attribute.accessPolicyWindow.base_ptr = Kparams.L2; - stream_attribute.accessPolicyWindow.num_bytes = size; - stream_attribute.accessPolicyWindow.hitRatio = 1.0; - stream_attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; - stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming; - err = cudaStreamSetAttribute(NULL, cudaStreamAttributeAccessPolicyWindow, &stream_attribute); - if (err != cudaSuccess) - { - printf("GPU %d, cudaStreamSetAttribute failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - } - size = MAX_DP_CNT * GPU_DP_SIZE + 16; - total_mem += size; - err = cudaMalloc((void**)&Kparams.DPs_out, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate GpuOut memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = KangCnt * 96; - total_mem += size; - err = cudaMalloc((void**)&Kparams.Kangs, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate pKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - total_mem += JMP_CNT * 96; - err = cudaMalloc((void**)&Kparams.Jumps1, JMP_CNT * 96); - if (err != cudaSuccess) - { - printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - total_mem += JMP_CNT * 96; - err = cudaMalloc((void**)&Kparams.Jumps2, JMP_CNT * 96); - if (err != cudaSuccess) - { - printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - total_mem += JMP_CNT * 96; - err = cudaMalloc((void**)&Kparams.Jumps3, JMP_CNT * 96); - if (err != cudaSuccess) - { - printf("GPU %d Allocate Jumps3 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = 2 * (u64)KangCnt * STEP_CNT; - total_mem += size; - err = cudaMalloc((void**)&Kparams.JumpsList, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate JumpsList memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = (u64)KangCnt * (16 * DPTABLE_MAX_CNT + sizeof(u32)); //we store 16bytes of X - total_mem += size; - err = cudaMalloc((void**)&Kparams.DPTable, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate DPTable memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = mpCnt * Kparams.BlockSize * sizeof(u64); - total_mem += size; - err = cudaMalloc((void**)&Kparams.L1S2, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate L1S2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = (u64)KangCnt * MD_LEN * (2 * 32); - total_mem += size; - err = cudaMalloc((void**)&Kparams.LastPnts, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = (u64)KangCnt * MD_LEN * sizeof(u64); - total_mem += size; - err = cudaMalloc((void**)&Kparams.LoopTable, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - total_mem += 1024; - err = cudaMalloc((void**)&Kparams.dbg_buf, 1024); - if (err != cudaSuccess) - { - printf("GPU %d Allocate dbg_buf memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - size = sizeof(u32) * KangCnt + 8; - total_mem += size; - err = cudaMalloc((void**)&Kparams.LoopedKangs, size); - if (err != cudaSuccess) - { - printf("GPU %d Allocate LoopedKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - - DPs_out = (u32*)malloc(MAX_DP_CNT * GPU_DP_SIZE); - -//jmp1 - u64* buf = (u64*)malloc(JMP_CNT * 96); - for (int i = 0; i < JMP_CNT; i++) - { - memcpy(buf + i * 12, EcJumps1[i].p.x.data, 32); - memcpy(buf + i * 12 + 4, EcJumps1[i].p.y.data, 32); - memcpy(buf + i * 12 + 8, EcJumps1[i].dist.data, 32); - } - err = cudaMemcpy(Kparams.Jumps1, buf, JMP_CNT * 96, cudaMemcpyHostToDevice); - if (err != cudaSuccess) - { - printf("GPU %d, cudaMemcpy Jumps1 failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - free(buf); -//jmp2 - buf = (u64*)malloc(JMP_CNT * 96); - u64* jmp2_table = (u64*)malloc(JMP_CNT * 64); - for (int i = 0; i < JMP_CNT; i++) - { - memcpy(buf + i * 12, EcJumps2[i].p.x.data, 32); - memcpy(jmp2_table + i * 8, EcJumps2[i].p.x.data, 32); - memcpy(buf + i * 12 + 4, EcJumps2[i].p.y.data, 32); - memcpy(jmp2_table + i * 8 + 4, EcJumps2[i].p.y.data, 32); - memcpy(buf + i * 12 + 8, EcJumps2[i].dist.data, 32); - } - err = cudaMemcpy(Kparams.Jumps2, buf, JMP_CNT * 96, cudaMemcpyHostToDevice); - if (err != cudaSuccess) - { - printf("GPU %d, cudaMemcpy Jumps2 failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - free(buf); - - err = cuSetGpuParams(Kparams, jmp2_table); - if (err != cudaSuccess) - { - free(jmp2_table); - printf("GPU %d, cuSetGpuParams failed: %s!\r\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - free(jmp2_table); -//jmp3 - buf = (u64*)malloc(JMP_CNT * 96); - for (int i = 0; i < JMP_CNT; i++) - { - memcpy(buf + i * 12, EcJumps3[i].p.x.data, 32); - memcpy(buf + i * 12 + 4, EcJumps3[i].p.y.data, 32); - memcpy(buf + i * 12 + 8, EcJumps3[i].dist.data, 32); - } - err = cudaMemcpy(Kparams.Jumps3, buf, JMP_CNT * 96, cudaMemcpyHostToDevice); - if (err != cudaSuccess) - { - printf("GPU %d, cudaMemcpy Jumps3 failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - free(buf); - - printf("GPU %d: allocated %llu MB, %d kangaroos. OldGpuMode: %s\r\n", CudaIndex, total_mem / (1024 * 1024), KangCnt, IsOldGpu ? "Yes" : "No"); - return true; -} - -void RCGpuKang::Release() -{ - free(RndPnts); - free(DPs_out); - cudaFree(Kparams.LoopedKangs); - cudaFree(Kparams.dbg_buf); - cudaFree(Kparams.LoopTable); - cudaFree(Kparams.LastPnts); - cudaFree(Kparams.L1S2); - cudaFree(Kparams.DPTable); - cudaFree(Kparams.JumpsList); - cudaFree(Kparams.Jumps3); - cudaFree(Kparams.Jumps2); - cudaFree(Kparams.Jumps1); - cudaFree(Kparams.Kangs); - cudaFree(Kparams.DPs_out); - if (!IsOldGpu) - cudaFree(Kparams.L2); -} - -void RCGpuKang::Stop() -{ - StopFlag = true; -} - -void RCGpuKang::GenerateRndDistances() -{ - for (int i = 0; i < KangCnt; i++) - { - EcInt d; - if (i < KangCnt / 3) - d.RndBits(Range - 4); //TAME kangs - else - { - d.RndBits(Range - 1); - d.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even - } - memcpy(RndPnts[i].priv, d.data, 24); - } -} - -bool RCGpuKang::Start() -{ - if (Failed) - return false; - - cudaError_t err; - err = cudaSetDevice(CudaIndex); - if (err != cudaSuccess) - return false; - - HalfRange.Set(1); - HalfRange.ShiftLeft(Range - 1); - PntHalfRange = ec.MultiplyG(HalfRange); - NegPntHalfRange = PntHalfRange; - NegPntHalfRange.y.NegModP(); - - PntA = ec.AddPoints(PntToSolve, NegPntHalfRange); - PntB = PntA; - PntB.y.NegModP(); - - RndPnts = (TPointPriv*)malloc(KangCnt * 96); - GenerateRndDistances(); -/* - //we can calc start points on CPU - for (int i = 0; i < KangCnt; i++) - { - EcInt d; - memcpy(d.data, RndPnts[i].priv, 24); - d.data[3] = 0; - d.data[4] = 0; - EcPoint p = ec.MultiplyG(d); - memcpy(RndPnts[i].x, p.x.data, 32); - memcpy(RndPnts[i].y, p.y.data, 32); - } - for (int i = KangCnt / 3; i < 2 * KangCnt / 3; i++) - { - EcPoint p; - p.LoadFromBuffer64((u8*)RndPnts[i].x); - p = ec.AddPoints(p, PntA); - p.SaveToBuffer64((u8*)RndPnts[i].x); - } - for (int i = 2 * KangCnt / 3; i < KangCnt; i++) - { - EcPoint p; - p.LoadFromBuffer64((u8*)RndPnts[i].x); - p = ec.AddPoints(p, PntB); - p.SaveToBuffer64((u8*)RndPnts[i].x); - } - //copy to gpu - err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice); - if (err != cudaSuccess) - { - printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } -/**/ - //but it's faster to calc then on GPU - u8 buf_PntA[64], buf_PntB[64]; - PntA.SaveToBuffer64(buf_PntA); - PntB.SaveToBuffer64(buf_PntB); - for (int i = 0; i < KangCnt; i++) - { - if (i < KangCnt / 3) - memset(RndPnts[i].x, 0, 64); - else - if (i < 2 * KangCnt / 3) - memcpy(RndPnts[i].x, buf_PntA, 64); - else - memcpy(RndPnts[i].x, buf_PntB, 64); - } - //copy to gpu - err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice); - if (err != cudaSuccess) - { - printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err)); - return false; - } - CallGpuKernelGen(Kparams); - - err = cudaMemset(Kparams.L1S2, 0, mpCnt * Kparams.BlockSize * 8); - if (err != cudaSuccess) - return false; - cudaMemset(Kparams.dbg_buf, 0, 1024); - cudaMemset(Kparams.LoopTable, 0, KangCnt * MD_LEN * sizeof(u64)); - return true; -} - -#ifdef DEBUG_MODE -int RCGpuKang::Dbg_CheckKangs() -{ - int kang_size = mpCnt * Kparams.BlockSize * Kparams.GroupCnt * 96; - u64* kangs = (u64*)malloc(kang_size); - cudaError_t err = cudaMemcpy(kangs, Kparams.Kangs, kang_size, cudaMemcpyDeviceToHost); - int res = 0; - for (int i = 0; i < KangCnt; i++) - { - EcPoint Pnt, p; - Pnt.LoadFromBuffer64((u8*)&kangs[i * 12 + 0]); - EcInt dist; - dist.Set(0); - memcpy(dist.data, &kangs[i * 12 + 8], 24); - bool neg = false; - if (dist.data[2] >> 63) - { - neg = true; - memset(((u8*)dist.data) + 24, 0xFF, 16); - dist.Neg(); - } - p = ec.MultiplyG_Fast(dist); - if (neg) - p.y.NegModP(); - if (i < KangCnt / 3) - p = p; - else - if (i < 2 * KangCnt / 3) - p = ec.AddPoints(PntA, p); - else - p = ec.AddPoints(PntB, p); - if (!p.IsEqual(Pnt)) - res++; - } - free(kangs); - return res; -} -#endif - -extern u32 gTotalErrors; - -//executes in separate thread -void RCGpuKang::Execute() -{ - cudaSetDevice(CudaIndex); - - if (!Start()) - { - gTotalErrors++; - return; - } -#ifdef DEBUG_MODE - u64 iter = 1; -#endif - cudaError_t err; - while (!StopFlag) - { - u64 t1 = GetTickCount64(); - cudaMemset(Kparams.DPs_out, 0, 4); - cudaMemset(Kparams.DPTable, 0, KangCnt * sizeof(u32)); - cudaMemset(Kparams.LoopedKangs, 0, 8); - CallGpuKernelABC(Kparams); - int cnt; - err = cudaMemcpy(&cnt, Kparams.DPs_out, 4, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) - { - printf("GPU %d, CallGpuKernel failed: %s\r\n", CudaIndex, cudaGetErrorString(err)); - gTotalErrors++; - break; - } - - if (cnt >= MAX_DP_CNT) - { - cnt = MAX_DP_CNT; - printf("GPU %d, gpu DP buffer overflow, some points lost, increase DP value!\r\n", CudaIndex); - } - u64 pnt_cnt = (u64)KangCnt * STEP_CNT; - - if (cnt) - { - err = cudaMemcpy(DPs_out, Kparams.DPs_out + 4, cnt * GPU_DP_SIZE, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) - { - gTotalErrors++; - break; - } - AddPointsToList(DPs_out, cnt, (u64)KangCnt * STEP_CNT); - } - - //dbg - cudaMemcpy(dbg, Kparams.dbg_buf, 1024, cudaMemcpyDeviceToHost); - - u32 lcnt; - cudaMemcpy(&lcnt, Kparams.LoopedKangs, 4, cudaMemcpyDeviceToHost); - //printf("GPU %d, Looped: %d\r\n", CudaIndex, lcnt); - - u64 t2 = GetTickCount64(); - u64 tm = t2 - t1; - if (!tm) - tm = 1; - int cur_speed = (int)(pnt_cnt / (tm * 1000)); - //printf("GPU %d kernel time %d ms, speed %d MH\r\n", CudaIndex, (int)tm, cur_speed); - - SpeedStats[cur_stats_ind] = cur_speed; - cur_stats_ind = (cur_stats_ind + 1) % STATS_WND_SIZE; - -#ifdef DEBUG_MODE - if ((iter % 300) == 0) - { - int corr_cnt = Dbg_CheckKangs(); - if (corr_cnt) - { - printf("DBG: GPU %d, KANGS CORRUPTED: %d\r\n", CudaIndex, corr_cnt); - gTotalErrors++; - } - else - printf("DBG: GPU %d, ALL KANGS OK!\r\n", CudaIndex); - } - iter++; -#endif - } - - Release(); -} - -int RCGpuKang::GetStatsSpeed() -{ - int res = SpeedStats[0]; - for (int i = 1; i < STATS_WND_SIZE; i++) - res += SpeedStats[i]; - return res / STATS_WND_SIZE; +// This file is a part of RCKangaroo software +// (c) 2024, RetiredCoder (RC) +// License: GPLv3, see "LICENSE.TXT" file +// https://github.com/RetiredC + + +#include +#include "cuda_runtime.h" +#include "cuda.h" + +#include "GpuKang.h" +extern int gTameRatioPct; +extern int gTameBitsOffset; + +cudaError_t cuSetGpuParams(TKparams Kparams, u64* _jmp2_table); +void CallGpuKernelGen(TKparams Kparams); +void CallGpuKernelABC(TKparams Kparams); +void AddPointsToList(u32* data, int cnt, u64 ops_cnt); +extern bool gGenMode; //tames generation mode + +int RCGpuKang::CalcKangCnt() +{ + Kparams.BlockCnt = mpCnt; + Kparams.BlockSize = IsOldGpu ? 512 : 256; + Kparams.GroupCnt = IsOldGpu ? 64 : 24; + return Kparams.BlockSize* Kparams.GroupCnt* Kparams.BlockCnt; +} + +//executes in main thread +bool RCGpuKang::Prepare(EcPoint _PntToSolve, int _Range, int _DP, EcJMP* _EcJumps1, EcJMP* _EcJumps2, EcJMP* _EcJumps3) +{ + PntToSolve = _PntToSolve; + Range = _Range; + DP = _DP; + EcJumps1 = _EcJumps1; + EcJumps2 = _EcJumps2; + EcJumps3 = _EcJumps3; + StopFlag = false; + Failed = false; + u64 total_mem = 0; + memset(dbg, 0, sizeof(dbg)); + memset(SpeedStats, 0, sizeof(SpeedStats)); + cur_stats_ind = 0; + + cudaError_t err; + err = cudaSetDevice(CudaIndex); + if (err != cudaSuccess) + return false; + + Kparams.BlockCnt = mpCnt; + Kparams.BlockSize = IsOldGpu ? 512 : 256; + Kparams.GroupCnt = IsOldGpu ? 64 : 24; + KangCnt = Kparams.BlockSize * Kparams.GroupCnt * Kparams.BlockCnt; + Kparams.KangCnt = KangCnt; + Kparams.DP = DP; + Kparams.KernelA_LDS_Size = 64 * JMP_CNT + 16 * Kparams.BlockSize; + Kparams.KernelB_LDS_Size = 64 * JMP_CNT; + Kparams.KernelC_LDS_Size = 96 * JMP_CNT; + Kparams.IsGenMode = gGenMode; + +//allocate gpu mem + u64 size; + if (!IsOldGpu) + { + //L2 + int L2size = Kparams.KangCnt * (3 * 32); + total_mem += L2size; + err = cudaMalloc((void**)&Kparams.L2, L2size); + if (err != cudaSuccess) + { + printf("GPU %d, Allocate L2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + size = L2size; + if (size > persistingL2CacheMaxSize) + size = persistingL2CacheMaxSize; + err = cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, size); // set max allowed size for L2 + //persisting for L2 + cudaStreamAttrValue stream_attribute; + stream_attribute.accessPolicyWindow.base_ptr = Kparams.L2; + stream_attribute.accessPolicyWindow.num_bytes = size; + stream_attribute.accessPolicyWindow.hitRatio = 1.0; + stream_attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; + stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming; + err = cudaStreamSetAttribute(NULL, cudaStreamAttributeAccessPolicyWindow, &stream_attribute); + if (err != cudaSuccess) + { + printf("GPU %d, cudaStreamSetAttribute failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + } + size = MAX_DP_CNT * GPU_DP_SIZE + 16; + total_mem += size; + err = cudaMalloc((void**)&Kparams.DPs_out, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate GpuOut memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = KangCnt * 96; + total_mem += size; + err = cudaMalloc((void**)&Kparams.Kangs, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate pKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + total_mem += JMP_CNT * 96; + err = cudaMalloc((void**)&Kparams.Jumps1, JMP_CNT * 96); + if (err != cudaSuccess) + { + printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + total_mem += JMP_CNT * 96; + err = cudaMalloc((void**)&Kparams.Jumps2, JMP_CNT * 96); + if (err != cudaSuccess) + { + printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + total_mem += JMP_CNT * 96; + err = cudaMalloc((void**)&Kparams.Jumps3, JMP_CNT * 96); + if (err != cudaSuccess) + { + printf("GPU %d Allocate Jumps3 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = 2 * (u64)KangCnt * STEP_CNT; + total_mem += size; + err = cudaMalloc((void**)&Kparams.JumpsList, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate JumpsList memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = (u64)KangCnt * (16 * DPTABLE_MAX_CNT + sizeof(u32)); //we store 16bytes of X + total_mem += size; + err = cudaMalloc((void**)&Kparams.DPTable, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate DPTable memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = mpCnt * Kparams.BlockSize * sizeof(u64); + total_mem += size; + err = cudaMalloc((void**)&Kparams.L1S2, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate L1S2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = (u64)KangCnt * MD_LEN * (2 * 32); + total_mem += size; + err = cudaMalloc((void**)&Kparams.LastPnts, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = (u64)KangCnt * MD_LEN * sizeof(u64); + total_mem += size; + err = cudaMalloc((void**)&Kparams.LoopTable, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + total_mem += 1024; + err = cudaMalloc((void**)&Kparams.dbg_buf, 1024); + if (err != cudaSuccess) + { + printf("GPU %d Allocate dbg_buf memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + size = sizeof(u32) * KangCnt + 8; + total_mem += size; + err = cudaMalloc((void**)&Kparams.LoopedKangs, size); + if (err != cudaSuccess) + { + printf("GPU %d Allocate LoopedKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + + DPs_out = (u32*)malloc(MAX_DP_CNT * GPU_DP_SIZE); + +//jmp1 + u64* buf = (u64*)malloc(JMP_CNT * 96); + for (int i = 0; i < JMP_CNT; i++) + { + memcpy(buf + i * 12, EcJumps1[i].p.x.data, 32); + memcpy(buf + i * 12 + 4, EcJumps1[i].p.y.data, 32); + memcpy(buf + i * 12 + 8, EcJumps1[i].dist.data, 32); + } + err = cudaMemcpy(Kparams.Jumps1, buf, JMP_CNT * 96, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + printf("GPU %d, cudaMemcpy Jumps1 failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + free(buf); +//jmp2 + buf = (u64*)malloc(JMP_CNT * 96); + u64* jmp2_table = (u64*)malloc(JMP_CNT * 64); + for (int i = 0; i < JMP_CNT; i++) + { + memcpy(buf + i * 12, EcJumps2[i].p.x.data, 32); + memcpy(jmp2_table + i * 8, EcJumps2[i].p.x.data, 32); + memcpy(buf + i * 12 + 4, EcJumps2[i].p.y.data, 32); + memcpy(jmp2_table + i * 8 + 4, EcJumps2[i].p.y.data, 32); + memcpy(buf + i * 12 + 8, EcJumps2[i].dist.data, 32); + } + err = cudaMemcpy(Kparams.Jumps2, buf, JMP_CNT * 96, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + printf("GPU %d, cudaMemcpy Jumps2 failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + free(buf); + + err = cuSetGpuParams(Kparams, jmp2_table); + if (err != cudaSuccess) + { + free(jmp2_table); + printf("GPU %d, cuSetGpuParams failed: %s!\r\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + free(jmp2_table); +//jmp3 + buf = (u64*)malloc(JMP_CNT * 96); + for (int i = 0; i < JMP_CNT; i++) + { + memcpy(buf + i * 12, EcJumps3[i].p.x.data, 32); + memcpy(buf + i * 12 + 4, EcJumps3[i].p.y.data, 32); + memcpy(buf + i * 12 + 8, EcJumps3[i].dist.data, 32); + } + err = cudaMemcpy(Kparams.Jumps3, buf, JMP_CNT * 96, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + printf("GPU %d, cudaMemcpy Jumps3 failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + free(buf); + + printf("GPU %d: allocated %llu MB, %d kangaroos. OldGpuMode: %s\r\n", CudaIndex, total_mem / (1024 * 1024), KangCnt, IsOldGpu ? "Yes" : "No"); + return true; +} + +void RCGpuKang::Release() +{ + free(RndPnts); + free(DPs_out); + cudaFree(Kparams.LoopedKangs); + cudaFree(Kparams.dbg_buf); + cudaFree(Kparams.LoopTable); + cudaFree(Kparams.LastPnts); + cudaFree(Kparams.L1S2); + cudaFree(Kparams.DPTable); + cudaFree(Kparams.JumpsList); + cudaFree(Kparams.Jumps3); + cudaFree(Kparams.Jumps2); + cudaFree(Kparams.Jumps1); + cudaFree(Kparams.Kangs); + cudaFree(Kparams.DPs_out); + if (!IsOldGpu) + cudaFree(Kparams.L2); +} + +void RCGpuKang::Stop() +{ + StopFlag = true; +} + +void RCGpuKang::GenerateRndDistances() +{ + for (int i = 0; i < KangCnt; i++) + { + EcInt d; + int tameBorder = (KangCnt * gTameRatioPct) / 100; + int tameBits = Range - gTameBitsOffset; + if (tameBits < 1) tameBits = 1; + if (i < tameBorder) + d.RndBits(tameBits); // TAME kangs + else + { + d.RndBits(Range - 1); + d.data[0] &= 0xFFFFFFFFFFFFFFFE; // must be even + } + memcpy(RndPnts[i].priv, d.data, 24); + } +} + +bool RCGpuKang::Start() +{ + if (Failed) + return false; + + cudaError_t err; + err = cudaSetDevice(CudaIndex); + if (err != cudaSuccess) + return false; + + HalfRange.Set(1); + HalfRange.ShiftLeft(Range - 1); + PntHalfRange = ec.MultiplyG(HalfRange); + NegPntHalfRange = PntHalfRange; + NegPntHalfRange.y.NegModP(); + + PntA = ec.AddPoints(PntToSolve, NegPntHalfRange); + PntB = PntA; + PntB.y.NegModP(); + + RndPnts = (TPointPriv*)malloc(KangCnt * 96); + GenerateRndDistances(); +/* + //we can calc start points on CPU + for (int i = 0; i < KangCnt; i++) + { + EcInt d; + memcpy(d.data, RndPnts[i].priv, 24); + d.data[3] = 0; + d.data[4] = 0; + EcPoint p = ec.MultiplyG(d); + memcpy(RndPnts[i].x, p.x.data, 32); + memcpy(RndPnts[i].y, p.y.data, 32); + } + for (int i = KangCnt / 3; i < 2 * KangCnt / 3; i++) + { + EcPoint p; + p.LoadFromBuffer64((u8*)RndPnts[i].x); + p = ec.AddPoints(p, PntA); + p.SaveToBuffer64((u8*)RndPnts[i].x); + } + for (int i = 2 * KangCnt / 3; i < KangCnt; i++) + { + EcPoint p; + p.LoadFromBuffer64((u8*)RndPnts[i].x); + p = ec.AddPoints(p, PntB); + p.SaveToBuffer64((u8*)RndPnts[i].x); + } + //copy to gpu + err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } +/**/ + //but it's faster to calc then on GPU + u8 buf_PntA[64], buf_PntB[64]; + PntA.SaveToBuffer64(buf_PntA); + PntB.SaveToBuffer64(buf_PntB); + for (int i = 0; i < KangCnt; i++) + { + if (i < KangCnt / 3) + memset(RndPnts[i].x, 0, 64); + else + if (i < 2 * KangCnt / 3) + memcpy(RndPnts[i].x, buf_PntA, 64); + else + memcpy(RndPnts[i].x, buf_PntB, 64); + } + //copy to gpu + err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err)); + return false; + } + CallGpuKernelGen(Kparams); + + err = cudaMemset(Kparams.L1S2, 0, mpCnt * Kparams.BlockSize * 8); + if (err != cudaSuccess) + return false; + cudaMemset(Kparams.dbg_buf, 0, 1024); + cudaMemset(Kparams.LoopTable, 0, KangCnt * MD_LEN * sizeof(u64)); + return true; +} + +#ifdef DEBUG_MODE +int RCGpuKang::Dbg_CheckKangs() +{ + int kang_size = mpCnt * Kparams.BlockSize * Kparams.GroupCnt * 96; + u64* kangs = (u64*)malloc(kang_size); + cudaError_t err = cudaMemcpy(kangs, Kparams.Kangs, kang_size, cudaMemcpyDeviceToHost); + int res = 0; + for (int i = 0; i < KangCnt; i++) + { + EcPoint Pnt, p; + Pnt.LoadFromBuffer64((u8*)&kangs[i * 12 + 0]); + EcInt dist; + dist.Set(0); + memcpy(dist.data, &kangs[i * 12 + 8], 24); + bool neg = false; + if (dist.data[2] >> 63) + { + neg = true; + memset(((u8*)dist.data) + 24, 0xFF, 16); + dist.Neg(); + } + p = ec.MultiplyG_Fast(dist); + if (neg) + p.y.NegModP(); + if (i < KangCnt / 3) + p = p; + else + if (i < 2 * KangCnt / 3) + p = ec.AddPoints(PntA, p); + else + p = ec.AddPoints(PntB, p); + if (!p.IsEqual(Pnt)) + res++; + } + free(kangs); + return res; +} +#endif + +extern u32 gTotalErrors; + +//executes in separate thread +void RCGpuKang::Execute() +{ + cudaSetDevice(CudaIndex); + + if (!Start()) + { + gTotalErrors++; + return; + } +#ifdef DEBUG_MODE + u64 iter = 1; +#endif + cudaError_t err; + while (!StopFlag) + { + u64 t1 = GetTickCount64(); + cudaMemset(Kparams.DPs_out, 0, 4); + cudaMemset(Kparams.DPTable, 0, KangCnt * sizeof(u32)); + cudaMemset(Kparams.LoopedKangs, 0, 8); + CallGpuKernelABC(Kparams); + int cnt; + err = cudaMemcpy(&cnt, Kparams.DPs_out, 4, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + printf("GPU %d, CallGpuKernel failed: %s\r\n", CudaIndex, cudaGetErrorString(err)); + gTotalErrors++; + break; + } + + if (cnt >= MAX_DP_CNT) + { + cnt = MAX_DP_CNT; + printf("GPU %d, gpu DP buffer overflow, some points lost, increase DP value!\r\n", CudaIndex); + } + u64 pnt_cnt = (u64)KangCnt * STEP_CNT; + + if (cnt) + { + err = cudaMemcpy(DPs_out, Kparams.DPs_out + 4, cnt * GPU_DP_SIZE, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + gTotalErrors++; + break; + } + AddPointsToList(DPs_out, cnt, (u64)KangCnt * STEP_CNT); + } + + //dbg + cudaMemcpy(dbg, Kparams.dbg_buf, 1024, cudaMemcpyDeviceToHost); + + u32 lcnt; + cudaMemcpy(&lcnt, Kparams.LoopedKangs, 4, cudaMemcpyDeviceToHost); + //printf("GPU %d, Looped: %d\r\n", CudaIndex, lcnt); + + u64 t2 = GetTickCount64(); + u64 tm = t2 - t1; + if (!tm) + tm = 1; + int cur_speed = (int)(pnt_cnt / (tm * 1000)); + //printf("GPU %d kernel time %d ms, speed %d MH\r\n", CudaIndex, (int)tm, cur_speed); + + SpeedStats[cur_stats_ind] = cur_speed; + cur_stats_ind = (cur_stats_ind + 1) % STATS_WND_SIZE; + +#ifdef DEBUG_MODE + if ((iter % 300) == 0) + { + int corr_cnt = Dbg_CheckKangs(); + if (corr_cnt) + { + printf("DBG: GPU %d, KANGS CORRUPTED: %d\r\n", CudaIndex, corr_cnt); + gTotalErrors++; + } + else + printf("DBG: GPU %d, ALL KANGS OK!\r\n", CudaIndex); + } + iter++; +#endif + } + + Release(); +} + +int RCGpuKang::GetStatsSpeed() +{ + int res = SpeedStats[0]; + for (int i = 1; i < STATS_WND_SIZE; i++) + res += SpeedStats[i]; + return res / STATS_WND_SIZE; } \ No newline at end of file diff --git a/LICENSE.TXT b/LICENSE.TXT deleted file mode 100644 index 94a9ed0..0000000 --- a/LICENSE.TXT +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. diff --git a/Makefile b/Makefile index 8aa0b60..8cf0beb 100644 --- a/Makefile +++ b/Makefile @@ -1,29 +1,85 @@ -CC := g++ -NVCC := /usr/local/cuda-12.0/bin/nvcc -CUDA_PATH ?= /usr/local/cuda-12.0 - -CCFLAGS := -O3 -I$(CUDA_PATH)/include -NVCCFLAGS := -O3 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_75,code=compute_75 -gencode=arch=compute_61,code=compute_61 -LDFLAGS := -L$(CUDA_PATH)/lib64 -lcudart -pthread - -CPU_SRC := RCKangaroo.cpp GpuKang.cpp Ec.cpp utils.cpp -GPU_SRC := RCGpuCore.cu - -CPP_OBJECTS := $(CPU_SRC:.cpp=.o) -CU_OBJECTS := $(GPU_SRC:.cu=.o) - -TARGET := rckangaroo - -all: $(TARGET) - -$(TARGET): $(CPP_OBJECTS) $(CU_OBJECTS) - $(CC) $(CCFLAGS) -o $@ $^ $(LDFLAGS) - -%.o: %.cpp - $(CC) $(CCFLAGS) -c $< -o $@ - -%.o: %.cu - $(NVCC) $(NVCCFLAGS) -c $< -o $@ - -clean: - rm -f $(CPP_OBJECTS) $(CU_OBJECTS) +# RCKangaroo Makefile (robusto y autodetecta CUDA) +# Uso: +# make clean +# make SM=86 USE_JACOBIAN=1 PROFILE=release -j +# ./rckangaroo -h + +TARGET := rckangaroo + +# Toolchains +CC := g++ +NVCC := /usr/bin/nvcc + +# CUDA +CUDA_PATH ?= /usr/local/cuda-12.0 +SM ?= 86 +USE_JACOBIAN ?= 1 +PROFILE ?= release + +# Optimización separada: host vs device +HOST_COPT_release := -O3 -DNDEBUG -ffunction-sections -fdata-sections +HOST_COPT_debug := -O0 -g +HOST_COPT := $(HOST_COPT_$(PROFILE)) + +DEV_COPT_release := -O3 +DEV_COPT_debug := -O0 -g +DEV_COPT := $(DEV_COPT_$(PROFILE)) + +# Flags +CCFLAGS := -std=c++17 -I$(CUDA_PATH)/include $(HOST_COPT) -DUSE_JACOBIAN=$(USE_JACOBIAN) +NVCCFLAGS := -std=c++17 -arch=sm_$(SM) $(DEV_COPT) -Xptxas -O3 -Xptxas -dlcm=ca -Xfatbin=-compress-all -DUSE_JACOBIAN=$(USE_JACOBIAN) +NVCCXCOMP := -Xcompiler -ffunction-sections -Xcompiler -fdata-sections + +LDFLAGS := -L$(CUDA_PATH)/lib64 -lcudart -pthread + +# Fuentes +SRC_CPP := RCKangaroo.cpp GpuKang.cpp Ec.cpp utils.cpp + +# Directorio donde está el .cu (por defecto, raíz) +CU_DIR ?= . +SRC_CU := $(wildcard $(CU_DIR)/RCGpuCore.cu) + +OBJ_CPP := $(SRC_CPP:.cpp=.o) +OBJ_CU := $(patsubst %.cu,%.o,$(SRC_CU)) + +ifeq ($(strip $(OBJ_CU)),) + $(warning [Makefile] No se encontró RCGpuCore.cu en $(CU_DIR). Se construirá solo CPU.) + OBJS := $(OBJ_CPP) +else + OBJS := $(OBJ_CPP) $(OBJ_CU) +endif + +.PHONY: all clean print-vars + +all: $(TARGET) + +$(TARGET): $(OBJS) + $(CC) $(CCFLAGS) -o $@ $(OBJS) $(LDFLAGS) + +%.o: %.cpp + $(CC) $(CCFLAGS) -c $< -o $@ + +# Regla genérica CUDA (.cu -> .o) con flags host vía -Xcompiler +$(CU_DIR)/%.o: $(CU_DIR)/%.cu + $(NVCC) $(NVCCFLAGS) $(NVCCXCOMP) -c $< -o $@ + +# Regla explícita (por si tu make ignora patrones) +$(CU_DIR)/RCGpuCore.o: $(CU_DIR)/RCGpuCore.cu RCGpuUtils.h Ec.h defs.h + $(NVCC) $(NVCCFLAGS) $(NVCCXCOMP) -c $< -o $@ + +clean: + rm -f $(OBJ_CPP) $(OBJ_CU) $(TARGET) + +print-vars: + @echo "CUDA_PATH=$(CUDA_PATH)" + @echo "SM=$(SM)" + @echo "USE_JACOBIAN=$(USE_JACOBIAN)" + @echo "PROFILE=$(PROFILE)" + @echo "SRC_CPP=$(SRC_CPP)" + @echo "CU_DIR=$(CU_DIR)" + @echo "SRC_CU=$(SRC_CU)" + @echo "OBJ_CPP=$(OBJ_CPP)" + @echo "OBJ_CU=$(OBJ_CU)" + @echo "OBJS=$(OBJS)" + @echo "NVCCFLAGS=$(NVCCFLAGS)" + @echo "NVCCXCOMP=$(NVCCXCOMP)" diff --git a/RCGpuCore.cu b/RCGpuCore.cu index 4556a5c..c96a981 100644 --- a/RCGpuCore.cu +++ b/RCGpuCore.cu @@ -28,6 +28,11 @@ extern __shared__ u64 LDS[]; //this kernel performs main jumps extern "C" __launch_bounds__(BLOCK_SIZE, 1) + +// === Jacobian option (experimental) ========================================= +// Para habilitar: compilar con -DUSE_JACOBIAN=1. Mantiene el formato de salida. +// Implementa sumas J+A y conversión por lotes sólo para la verificación DP. +// ============================================================================ __global__ void KernelA(const TKparams Kparams) { u64* L2x = Kparams.L2 + 2 * THREAD_X + 4 * BLOCK_SIZE * BLOCK_X; @@ -79,6 +84,73 @@ __global__ void KernelA(const TKparams Kparams) for (int step_ind = 0; step_ind < STEP_CNT; step_ind++) { +#if USE_JACOBIAN + // [Jacobian path] — usamos L2x=L2X, L2y=L2Y, L2s=L2Z + __align__(16) u64 X[4], Y[4], Z[4]; + // 1) Cargar Z=1 si es la primera iteración + for (int group = 0; group < PNT_GROUP_CNT; group++) { + LOAD_VAL_256(X, L2x, group); + LOAD_VAL_256(Y, L2y, group); + if (step_ind==0) { Z[0]=1; Z[1]=Z[2]=Z[3]=0; } + else { LOAD_VAL_256(Z, L2s, group); } + // salto + u64* jmp_table; + __align__(16) u64 jmp_x[4], jmp_y[4]; + u16 jmp_ind_loc = X[0] % JMP_CNT; + jmp_table = ((L1S2 >> group) & 1) ? jmp2_table : jmp1_table; + Copy_int4_x2(jmp_x, jmp_table + 8 * jmp_ind_loc); + Copy_int4_x2(jmp_y, jmp_table + 8 * jmp_ind_loc + 4); + // manejo de inversión de y según bit + u32 inv_flag = (u32)Y[0] & 1u; + if (inv_flag) { jmp_ind_loc |= INV_FLAG; NegModP(jmp_y); } + u64 X3[4],Y3[4],Z3[4]; + JacobianAddMixed(X3,Y3,Z3,X,Y,Z,jmp_x,jmp_y); + SAVE_VAL_256(L2x, X3, group); + SAVE_VAL_256(L2y, Y3, group); + SAVE_VAL_256(L2s, Z3, group); + + // Conversión batched a afín sólo para DP (una inversión total por hilo) + } + // Batch inversion: Z^-1 para todos los grupos (Montgomery trick) + __align__(16) u64 prod[4] = {1,0,0,0}; + for (int group=0; group=0; group--) { + __align__(16) u64 prev[4] = {1,0,0,0}; + if (group>0) LOAD_VAL_256(prev, L2s, group-1); + LOAD_VAL_256(Z, L2s, group); + __align__(16) u64 invZ[4]; MulModP(invZ, prod, prev); // Z_i^-1 + // actualizar prod = prod * Z_i + MulModP(prod, prod, Z); + // x_aff = X * invZ^2 + __align__(16) u64 Xg[4], invZ2[4]; + LOAD_VAL_256(Xg, L2x, group); + MulModP(invZ2, invZ, invZ); + MulModP(Xg, Xg, invZ2); + // check DP + if ((Xg[3] & dp_mask64) == 0) { + u32 kang_ind = (THREAD_X + BLOCK_X * BLOCK_SIZE) * PNT_GROUP_CNT + group; + u32 ind = atomicAdd(Kparams.DPTable + kang_ind, 1); + ind = min(ind, DPTABLE_MAX_CNT - 1); + int4 *pdst = (int4*)(&Kparams.DPs_out[(u64)kang_ind * DPTABLE_MAX_CNT * 8 + 8 * ind]); + ((int4*)pdst)[0] = ((int4*)Xg)[0]; + ((int4*)pdst)[1] = ((int4*)Xg)[1]; + // acción de salto (igual al camino original) + // NOTA: reusamos jmp_ind_loc de arriba no disponible aquí; reconstituimos: + LOAD_VAL_256(X, L2x, group); + u16 ji = X[0] % JMP_CNT; + ji |= ((u32)Y[0] & 1u) ? 0 : INV_FLAG; + ((u16*)pdst)[8+0] = ji; // almacena índice salto+flags + ((u16*)pdst)[8+1] = 0; + } + } + continue; // saltamos el camino afín original +#endif // USE_JACOBIAN + __align__(16) u64 inverse[5]; u64* jmp_table; __align__(16) u64 jmp_x[4]; @@ -532,12 +604,61 @@ __device__ __forceinline__ bool ProcessJumpDistance(u32 step_ind, u32 d_cur, u64 table[iter] = d[0]; *cur_ind = (iter + 1) % MD_LEN; - if (found_ind < 0) - { - if (d_cur & DP_FLAG) - BuildDP(Kparams, kang_ind, d); - return false; - } + // --- Warp-aggregated emisión de DPs (reemplaza BuildDP) --- +if (found_ind < 0) +{ + // 1) Cada hilo decide si «quiere emitir» un DP en este paso + // (antes llamábamos a BuildDP) + bool emit = false; + int4 rx_local; // X parcial del DP (lo que lectura BuildDP hacía) + if (d_cur & DP_FLAG) { + // Replicamos la parte de BuildDP que obtiene el X guardado en DPTable + int idx = atomicAdd(Kparams.DPTable + kang_ind, 0x10000); + idx >>= 16; // índice de lectura (alto de 16 bits) + if (idx < DPTABLE_MAX_CNT) { + rx_local = *(int4*)(Kparams.DPTable + + Kparams.KangCnt + + (kang_ind * DPTABLE_MAX_CNT + idx) * 4); + emit = true; + } + } + + // 2) Warp-aggregated atomic: una sola atomicAdd por warp + const unsigned active = __activemask(); + const int lane = threadIdx.x & 31; + const int leader = __ffs(active) - 1; // primer lane activo + unsigned mask_emit = __ballot_sync(active, emit); // quiénes emiten en el warp + int hits = __popc(mask_emit); // cuántos DPs emiten + + if (hits) { + // Sólo el líder reserva 'hits' slots contiguos en la salida + unsigned base = 0; + if (lane == leader) { + base = atomicAdd(Kparams.DPs_out, (unsigned)hits); + } + // Broadcast del 'base' al warp + base = __shfl_sync(active, base, leader); + + if (emit) { + // Posición compacta de este hilo dentro del bloque reservado + unsigned laneMask = mask_emit & ((1u << lane) - 1u); + unsigned pos = __popc(laneMask); + + unsigned outIdx = base + pos; + outIdx = min(outIdx, (unsigned)(MAX_DP_CNT - 1)); + + // Escribir el registro exactamente igual que BuildDP + u32* DPs = Kparams.DPs_out + 4 + outIdx * (GPU_DP_SIZE / 4); + *(int4*)&DPs[0] = rx_local; // X parcial (int4) + *(int4*)&DPs[4] = ((int4*)d)[0]; // dist[0..1] (128 bits) + *(u64*)&DPs[8] = d[2]; // dist[2] (64 bits) + DPs[10] = 3 * kang_ind / Kparams.KangCnt; // tipo de kanguro + } + } + + return false; +} + u32 LoopSize = (iter + MD_LEN - found_ind) % MD_LEN; if (!LoopSize) diff --git a/RCGpuUtils.h b/RCGpuUtils.h index 6c5db83..8bf4e2e 100644 --- a/RCGpuUtils.h +++ b/RCGpuUtils.h @@ -519,7 +519,7 @@ __device__ __forceinline__ void InvModP(u32* res) __align__(8) u32 modp[9]; __align__(8) u32 val[9]; __align__(8) u32 a[9]; - __align__(8) u32 tmp[4][9+1]; //+1 because we need alignment 64bit for tmp[>0] + __align__(8) u32 tmp[4][9]; ((u64*)modp)[0] = P_0; ((u64*)modp)[1] = P_123; @@ -628,3 +628,53 @@ __device__ __forceinline__ void InvModP(u32* res) sub_288_P(res); } +// === Extensiones: utilidades Jacobianas (GPU) ================================ +__device__ __forceinline__ void SquareModP(u64 *res, u64 *a) { + MulModP(res, a, a); +} + +// r = 2*p (doblado jacobiano, a=0) +__device__ __forceinline__ void JacobianDouble(u64 *X3,u64 *Y3,u64 *Z3,const u64 *X1,const u64 *Y1,const u64 *Z1){ + u64 XX[4],YY[4],YYYY[4],S[4],M[4],T[4],twoS[4],Z3tmp[4]; + Copy_u64_x4(XX,(void*)X1); MulModP(XX,XX,XX); + Copy_u64_x4(YY,(void*)Y1); MulModP(YY,YY,YY); + Copy_u64_x4(YYYY,YY); MulModP(YYYY,YYYY,YY); + Copy_u64_x4(S,(void*)X1); MulModP(S,S,YY); AddModP(S,S,S); // 2*X1*Y1^2 + Copy_u64_x4(M,XX); AddModP(M,M,XX); AddModP(M,M,XX); // 3*X1^2 + Copy_u64_x4(T,M); MulModP(T,T,M); // M^2 + Copy_u64_x4(twoS,S); AddModP(twoS,twoS,S); // 2*S + SubModP((u64*)X3,T,twoS); // X3 + u64 V[4]; Copy_u64_x4(V,S); SubModP(V,V,(u64*)X3); MulModP(V,V,M); + u64 eightYYYY[4]; Copy_u64_x4(eightYYYY,YYYY); + for (int i=0;i<3;i++) AddModP(eightYYYY,eightYYYY,YYYY); // *8 + SubModP((u64*)Y3,V,eightYYYY); + Copy_u64_x4(Z3tmp,(void*)Y1); AddModP(Z3tmp,Z3tmp,(u64*)Y1); MulModP(Z3tmp,Z3tmp,(u64*)Z1); + Copy_u64_x4((u64*)Z3,Z3tmp); +} + +// r = p + q_aff (mixta) +__device__ __forceinline__ void JacobianAddMixed(u64 *X3,u64 *Y3,u64 *Z3,const u64 *X1,const u64 *Y1,const u64 *Z1,const u64 *Qx,const u64 *Qy){ + u64 Z2[4],Z3_[4],U2[4],S2[4],H[4],R[4]; + Copy_u64_x4(Z2,(void*)Z1); MulModP(Z2,Z2,Z2); + Copy_u64_x4(Z3_,Z2); MulModP(Z3_,Z3_,(u64*)Z1); + Copy_u64_x4(U2,(void*)Qx); MulModP(U2,U2,Z2); + Copy_u64_x4(S2,(void*)Qy); MulModP(S2,S2,Z3_); + Copy_u64_x4(H,U2); SubModP(H,H,(u64*)X1); + Copy_u64_x4(R,S2); SubModP(R,R,(u64*)Y1); + // if H==0 -> degenerate (omit; improbable en saltos aleatorios) + u64 HH[4],HHH[4],V[4],X3_[4],Y3_[4],Z3tmp[4]; + Copy_u64_x4(HH,H); MulModP(HH,HH,H); + Copy_u64_x4(HHH,HH);MulModP(HHH,HHH,H); + Copy_u64_x4(V,(void*)X1); MulModP(V,V,HH); + Copy_u64_x4(X3_,R); MulModP(X3_,X3_,R); + SubModP(X3_,X3_,HHH); + u64 twoV[4]; Copy_u64_x4(twoV,V); AddModP(twoV,twoV,V); + SubModP((u64*)X3,X3_,twoV); + Copy_u64_x4(Y3_,V); SubModP(Y3_,Y3_,(u64*)X3); MulModP(Y3_,Y3_,R); + u64 S1HHH[4]; Copy_u64_x4(S1HHH,(void*)Y1); MulModP(S1HHH,S1HHH,HHH); + SubModP((u64*)Y3,Y3_,S1HHH); + Copy_u64_x4(Z3tmp,(void*)Z1); MulModP(Z3tmp,Z3tmp,H); + Copy_u64_x4((u64*)Z3,Z3tmp); +} +// ============================================================================ + diff --git a/RCKangaroo.cpp b/RCKangaroo.cpp index 19c0fae..605c470 100644 --- a/RCKangaroo.cpp +++ b/RCKangaroo.cpp @@ -1,764 +1,789 @@ -// This file is a part of RCKangaroo software -// (c) 2024, RetiredCoder (RC) -// License: GPLv3, see "LICENSE.TXT" file -// https://github.com/RetiredC - - -#include -#include - -#include "cuda_runtime.h" -#include "cuda.h" - -#include "defs.h" -#include "utils.h" -#include "GpuKang.h" - - -EcJMP EcJumps1[JMP_CNT]; -EcJMP EcJumps2[JMP_CNT]; -EcJMP EcJumps3[JMP_CNT]; - -RCGpuKang* GpuKangs[MAX_GPU_CNT]; -int GpuCnt; -volatile long ThrCnt; -volatile bool gSolved; - -EcInt Int_HalfRange; -EcPoint Pnt_HalfRange; -EcPoint Pnt_NegHalfRange; -EcInt Int_TameOffset; -Ec ec; - -CriticalSection csAddPoints; -u8* pPntList; -u8* pPntList2; -volatile int PntIndex; -TFastBase db; -EcPoint gPntToSolve; -EcInt gPrivKey; - -volatile u64 TotalOps; -u32 TotalSolved; -u32 gTotalErrors; -u64 PntTotalOps; -bool IsBench; - -u32 gDP; -u32 gRange; -EcInt gStart; -bool gStartSet; -EcPoint gPubKey; -u8 gGPUs_Mask[MAX_GPU_CNT]; -char gTamesFileName[1024]; -double gMax; -bool gGenMode; //tames generation mode -bool gIsOpsLimit; - -#pragma pack(push, 1) -struct DBRec -{ - u8 x[12]; - u8 d[22]; - u8 type; //0 - tame, 1 - wild1, 2 - wild2 -}; -#pragma pack(pop) - -void InitGpus() -{ - GpuCnt = 0; - int gcnt = 0; - cudaGetDeviceCount(&gcnt); - if (gcnt > MAX_GPU_CNT) - gcnt = MAX_GPU_CNT; - -// gcnt = 1; //dbg - if (!gcnt) - return; - - int drv, rt; - cudaRuntimeGetVersion(&rt); - cudaDriverGetVersion(&drv); - char drvver[100]; - sprintf(drvver, "%d.%d/%d.%d", drv / 1000, (drv % 100) / 10, rt / 1000, (rt % 100) / 10); - - printf("CUDA devices: %d, CUDA driver/runtime: %s\r\n", gcnt, drvver); - cudaError_t cudaStatus; - for (int i = 0; i < gcnt; i++) - { - cudaStatus = cudaSetDevice(i); - if (cudaStatus != cudaSuccess) - { - printf("cudaSetDevice for gpu %d failed!\r\n", i); - continue; - } - - if (!gGPUs_Mask[i]) - continue; - - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, i); - printf("GPU %d: %s, %.2f GB, %d CUs, cap %d.%d, PCI %d, L2 size: %d KB\r\n", i, deviceProp.name, ((float)(deviceProp.totalGlobalMem / (1024 * 1024))) / 1024.0f, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor, deviceProp.pciBusID, deviceProp.l2CacheSize / 1024); - - if (deviceProp.major < 6) - { - printf("GPU %d - not supported, skip\r\n", i); - continue; - } - - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - - GpuKangs[GpuCnt] = new RCGpuKang(); - GpuKangs[GpuCnt]->CudaIndex = i; - GpuKangs[GpuCnt]->persistingL2CacheMaxSize = deviceProp.persistingL2CacheMaxSize; - GpuKangs[GpuCnt]->mpCnt = deviceProp.multiProcessorCount; - GpuKangs[GpuCnt]->IsOldGpu = deviceProp.l2CacheSize < 16 * 1024 * 1024; - GpuCnt++; - } - printf("Total GPUs for work: %d\r\n", GpuCnt); -} -#ifdef _WIN32 -u32 __stdcall kang_thr_proc(void* data) -{ - RCGpuKang* Kang = (RCGpuKang*)data; - Kang->Execute(); - InterlockedDecrement(&ThrCnt); - return 0; -} -#else -void* kang_thr_proc(void* data) -{ - RCGpuKang* Kang = (RCGpuKang*)data; - Kang->Execute(); - __sync_fetch_and_sub(&ThrCnt, 1); - return 0; -} -#endif -void AddPointsToList(u32* data, int pnt_cnt, u64 ops_cnt) -{ - csAddPoints.Enter(); - if (PntIndex + pnt_cnt >= MAX_CNT_LIST) - { - csAddPoints.Leave(); - printf("DPs buffer overflow, some points lost, increase DP value!\r\n"); - return; - } - memcpy(pPntList + GPU_DP_SIZE * PntIndex, data, pnt_cnt * GPU_DP_SIZE); - PntIndex += pnt_cnt; - PntTotalOps += ops_cnt; - csAddPoints.Leave(); -} - -bool Collision_SOTA(EcPoint& pnt, EcInt t, int TameType, EcInt w, int WildType, bool IsNeg) -{ - if (IsNeg) - t.Neg(); - if (TameType == TAME) - { - gPrivKey = t; - gPrivKey.Sub(w); - EcInt sv = gPrivKey; - gPrivKey.Add(Int_HalfRange); - EcPoint P = ec.MultiplyG(gPrivKey); - if (P.IsEqual(pnt)) - return true; - gPrivKey = sv; - gPrivKey.Neg(); - gPrivKey.Add(Int_HalfRange); - P = ec.MultiplyG(gPrivKey); - return P.IsEqual(pnt); - } - else - { - gPrivKey = t; - gPrivKey.Sub(w); - if (gPrivKey.data[4] >> 63) - gPrivKey.Neg(); - gPrivKey.ShiftRight(1); - EcInt sv = gPrivKey; - gPrivKey.Add(Int_HalfRange); - EcPoint P = ec.MultiplyG(gPrivKey); - if (P.IsEqual(pnt)) - return true; - gPrivKey = sv; - gPrivKey.Neg(); - gPrivKey.Add(Int_HalfRange); - P = ec.MultiplyG(gPrivKey); - return P.IsEqual(pnt); - } -} - - -void CheckNewPoints() -{ - csAddPoints.Enter(); - if (!PntIndex) - { - csAddPoints.Leave(); - return; - } - - int cnt = PntIndex; - memcpy(pPntList2, pPntList, GPU_DP_SIZE * cnt); - PntIndex = 0; - csAddPoints.Leave(); - - for (int i = 0; i < cnt; i++) - { - DBRec nrec; - u8* p = pPntList2 + i * GPU_DP_SIZE; - memcpy(nrec.x, p, 12); - memcpy(nrec.d, p + 16, 22); - nrec.type = gGenMode ? TAME : p[40]; - - DBRec* pref = (DBRec*)db.FindOrAddDataBlock((u8*)&nrec); - if (gGenMode) - continue; - if (pref) - { - //in db we dont store first 3 bytes so restore them - DBRec tmp_pref; - memcpy(&tmp_pref, &nrec, 3); - memcpy(((u8*)&tmp_pref) + 3, pref, sizeof(DBRec) - 3); - pref = &tmp_pref; - - if (pref->type == nrec.type) - { - if (pref->type == TAME) - continue; - - //if it's wild, we can find the key from the same type if distances are different - if (*(u64*)pref->d == *(u64*)nrec.d) - continue; - //else - // ToLog("key found by same wild"); - } - - EcInt w, t; - int TameType, WildType; - if (pref->type != TAME) - { - memcpy(w.data, pref->d, sizeof(pref->d)); - if (pref->d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18); - memcpy(t.data, nrec.d, sizeof(nrec.d)); - if (nrec.d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18); - TameType = nrec.type; - WildType = pref->type; - } - else - { - memcpy(w.data, nrec.d, sizeof(nrec.d)); - if (nrec.d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18); - memcpy(t.data, pref->d, sizeof(pref->d)); - if (pref->d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18); - TameType = TAME; - WildType = nrec.type; - } - - bool res = Collision_SOTA(gPntToSolve, t, TameType, w, WildType, false) || Collision_SOTA(gPntToSolve, t, TameType, w, WildType, true); - if (!res) - { - bool w12 = ((pref->type == WILD1) && (nrec.type == WILD2)) || ((pref->type == WILD2) && (nrec.type == WILD1)); - if (w12) //in rare cases WILD and WILD2 can collide in mirror, in this case there is no way to find K - ;// ToLog("W1 and W2 collides in mirror"); - else - { - printf("Collision Error\r\n"); - gTotalErrors++; - } - continue; - } - gSolved = true; - break; - } - } -} - -void ShowStats(u64 tm_start, double exp_ops, double dp_val) -{ -#ifdef DEBUG_MODE - for (int i = 0; i <= MD_LEN; i++) - { - u64 val = 0; - for (int j = 0; j < GpuCnt; j++) - { - val += GpuKangs[j]->dbg[i]; - } - if (val) - printf("Loop size %d: %llu\r\n", i, val); - } -#endif - - int speed = GpuKangs[0]->GetStatsSpeed(); - for (int i = 1; i < GpuCnt; i++) - speed += GpuKangs[i]->GetStatsSpeed(); - - u64 est_dps_cnt = (u64)(exp_ops / dp_val); - u64 exp_sec = 0xFFFFFFFFFFFFFFFFull; - if (speed) - exp_sec = (u64)((exp_ops / 1000000) / speed); //in sec - u64 exp_days = exp_sec / (3600 * 24); - int exp_hours = (int)(exp_sec - exp_days * (3600 * 24)) / 3600; - int exp_min = (int)(exp_sec - exp_days * (3600 * 24) - exp_hours * 3600) / 60; - - u64 sec = (GetTickCount64() - tm_start) / 1000; - u64 days = sec / (3600 * 24); - int hours = (int)(sec - days * (3600 * 24)) / 3600; - int min = (int)(sec - days * (3600 * 24) - hours * 3600) / 60; - - printf("%sSpeed: %d MKeys/s, Err: %d, DPs: %lluK/%lluK, Time: %llud:%02dh:%02dm/%llud:%02dh:%02dm\r\n", gGenMode ? "GEN: " : (IsBench ? "BENCH: " : "MAIN: "), speed, gTotalErrors, db.GetBlockCnt()/1000, est_dps_cnt/1000, days, hours, min, exp_days, exp_hours, exp_min); -} - -bool SolvePoint(EcPoint PntToSolve, int Range, int DP, EcInt* pk_res) -{ - if ((Range < 32) || (Range > 180)) - { - printf("Unsupported Range value (%d)!\r\n", Range); - return false; - } - if ((DP < 14) || (DP > 60)) - { - printf("Unsupported DP value (%d)!\r\n", DP); - return false; - } - - printf("\r\nSolving point: Range %d bits, DP %d, start...\r\n", Range, DP); - double ops = 1.15 * pow(2.0, Range / 2.0); - double dp_val = (double)(1ull << DP); - double ram = (32 + 4 + 4) * ops / dp_val; //+4 for grow allocation and memory fragmentation - ram += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table - ram /= (1024 * 1024 * 1024); //GB - printf("SOTA method, estimated ops: 2^%.3f, RAM for DPs: %.3f GB. DP and GPU overheads not included!\r\n", log2(ops), ram); - gIsOpsLimit = false; - double MaxTotalOps = 0.0; - if (gMax > 0) - { - MaxTotalOps = gMax * ops; - double ram_max = (32 + 4 + 4) * MaxTotalOps / dp_val; //+4 for grow allocation and memory fragmentation - ram_max += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table - ram_max /= (1024 * 1024 * 1024); //GB - printf("Max allowed number of ops: 2^%.3f, max RAM for DPs: %.3f GB\r\n", log2(MaxTotalOps), ram_max); - } - - u64 total_kangs = GpuKangs[0]->CalcKangCnt(); - for (int i = 1; i < GpuCnt; i++) - total_kangs += GpuKangs[i]->CalcKangCnt(); - double path_single_kang = ops / total_kangs; - double DPs_per_kang = path_single_kang / dp_val; - printf("Estimated DPs per kangaroo: %.3f.%s\r\n", DPs_per_kang, (DPs_per_kang < 5) ? " DP overhead is big, use less DP value if possible!" : ""); - - if (!gGenMode && gTamesFileName[0]) - { - printf("load tames...\r\n"); - if (db.LoadFromFile(gTamesFileName)) - { - printf("tames loaded\r\n"); - if (db.Header[0] != gRange) - { - printf("loaded tames have different range, they cannot be used, clear\r\n"); - db.Clear(); - } - } - else - printf("tames loading failed\r\n"); - } - - SetRndSeed(0); //use same seed to make tames from file compatible - PntTotalOps = 0; - PntIndex = 0; -//prepare jumps - EcInt minjump, t; - minjump.Set(1); - minjump.ShiftLeft(Range / 2 + 3); - for (int i = 0; i < JMP_CNT; i++) - { - EcJumps1[i].dist = minjump; - t.RndMax(minjump); - EcJumps1[i].dist.Add(t); - EcJumps1[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even - EcJumps1[i].p = ec.MultiplyG(EcJumps1[i].dist); - } - - minjump.Set(1); - minjump.ShiftLeft(Range - 10); //large jumps for L1S2 loops. Must be almost RANGE_BITS - for (int i = 0; i < JMP_CNT; i++) - { - EcJumps2[i].dist = minjump; - t.RndMax(minjump); - EcJumps2[i].dist.Add(t); - EcJumps2[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even - EcJumps2[i].p = ec.MultiplyG(EcJumps2[i].dist); - } - - minjump.Set(1); - minjump.ShiftLeft(Range - 10 - 2); //large jumps for loops >2 - for (int i = 0; i < JMP_CNT; i++) - { - EcJumps3[i].dist = minjump; - t.RndMax(minjump); - EcJumps3[i].dist.Add(t); - EcJumps3[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even - EcJumps3[i].p = ec.MultiplyG(EcJumps3[i].dist); - } - SetRndSeed(GetTickCount64()); - - Int_HalfRange.Set(1); - Int_HalfRange.ShiftLeft(Range - 1); - Pnt_HalfRange = ec.MultiplyG(Int_HalfRange); - Pnt_NegHalfRange = Pnt_HalfRange; - Pnt_NegHalfRange.y.NegModP(); - Int_TameOffset.Set(1); - Int_TameOffset.ShiftLeft(Range - 1); - EcInt tt; - tt.Set(1); - tt.ShiftLeft(Range - 5); //half of tame range width - Int_TameOffset.Sub(tt); - gPntToSolve = PntToSolve; - -//prepare GPUs - for (int i = 0; i < GpuCnt; i++) - if (!GpuKangs[i]->Prepare(PntToSolve, Range, DP, EcJumps1, EcJumps2, EcJumps3)) - { - GpuKangs[i]->Failed = true; - printf("GPU %d Prepare failed\r\n", GpuKangs[i]->CudaIndex); - } - - u64 tm0 = GetTickCount64(); - printf("GPUs started...\r\n"); - -#ifdef _WIN32 - HANDLE thr_handles[MAX_GPU_CNT]; -#else - pthread_t thr_handles[MAX_GPU_CNT]; -#endif - - u32 ThreadID; - gSolved = false; - ThrCnt = GpuCnt; - for (int i = 0; i < GpuCnt; i++) - { -#ifdef _WIN32 - thr_handles[i] = (HANDLE)_beginthreadex(NULL, 0, kang_thr_proc, (void*)GpuKangs[i], 0, &ThreadID); -#else - pthread_create(&thr_handles[i], NULL, kang_thr_proc, (void*)GpuKangs[i]); -#endif - } - - u64 tm_stats = GetTickCount64(); - while (!gSolved) - { - CheckNewPoints(); - Sleep(10); - if (GetTickCount64() - tm_stats > 10 * 1000) - { - ShowStats(tm0, ops, dp_val); - tm_stats = GetTickCount64(); - } - - if ((MaxTotalOps > 0.0) && (PntTotalOps > MaxTotalOps)) - { - gIsOpsLimit = true; - printf("Operations limit reached\r\n"); - break; - } - } - - printf("Stopping work ...\r\n"); - for (int i = 0; i < GpuCnt; i++) - GpuKangs[i]->Stop(); - while (ThrCnt) - Sleep(10); - for (int i = 0; i < GpuCnt; i++) - { -#ifdef _WIN32 - CloseHandle(thr_handles[i]); -#else - pthread_join(thr_handles[i], NULL); -#endif - } - - if (gIsOpsLimit) - { - if (gGenMode) - { - printf("saving tames...\r\n"); - db.Header[0] = gRange; - if (db.SaveToFile(gTamesFileName)) - printf("tames saved\r\n"); - else - printf("tames saving failed\r\n"); - } - db.Clear(); - return false; - } - - double K = (double)PntTotalOps / pow(2.0, Range / 2.0); - printf("Point solved, K: %.3f (with DP and GPU overheads)\r\n\r\n", K); - db.Clear(); - *pk_res = gPrivKey; - return true; -} - -bool ParseCommandLine(int argc, char* argv[]) -{ - int ci = 1; - while (ci < argc) - { - char* argument = argv[ci]; - ci++; - if (strcmp(argument, "-gpu") == 0) - { - if (ci >= argc) - { - printf("error: missed value after -gpu option\r\n"); - return false; - } - char* gpus = argv[ci]; - ci++; - memset(gGPUs_Mask, 0, sizeof(gGPUs_Mask)); - for (int i = 0; i < (int)strlen(gpus); i++) - { - if ((gpus[i] < '0') || (gpus[i] > '9')) - { - printf("error: invalid value for -gpu option\r\n"); - return false; - } - gGPUs_Mask[gpus[i] - '0'] = 1; - } - } - else - if (strcmp(argument, "-dp") == 0) - { - int val = atoi(argv[ci]); - ci++; - if ((val < 14) || (val > 60)) - { - printf("error: invalid value for -dp option\r\n"); - return false; - } - gDP = val; - } - else - if (strcmp(argument, "-range") == 0) - { - int val = atoi(argv[ci]); - ci++; - if ((val < 32) || (val > 170)) - { - printf("error: invalid value for -range option\r\n"); - return false; - } - gRange = val; - } - else - if (strcmp(argument, "-start") == 0) - { - if (!gStart.SetHexStr(argv[ci])) - { - printf("error: invalid value for -start option\r\n"); - return false; - } - ci++; - gStartSet = true; - } - else - if (strcmp(argument, "-pubkey") == 0) - { - if (!gPubKey.SetHexStr(argv[ci])) - { - printf("error: invalid value for -pubkey option\r\n"); - return false; - } - ci++; - } - else - if (strcmp(argument, "-tames") == 0) - { - strcpy(gTamesFileName, argv[ci]); - ci++; - } - else - if (strcmp(argument, "-max") == 0) - { - double val = atof(argv[ci]); - ci++; - if (val < 0.001) - { - printf("error: invalid value for -max option\r\n"); - return false; - } - gMax = val; - } - else - { - printf("error: unknown option %s\r\n", argument); - return false; - } - } - if (!gPubKey.x.IsZero()) - if (!gStartSet || !gRange || !gDP) - { - printf("error: you must also specify -dp, -range and -start options\r\n"); - return false; - } - if (gTamesFileName[0] && !IsFileExist(gTamesFileName)) - { - if (gMax == 0.0) - { - printf("error: you must also specify -max option to generate tames\r\n"); - return false; - } - gGenMode = true; - } - return true; -} - -int main(int argc, char* argv[]) -{ -#ifdef _DEBUG - _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); -#endif - - printf("********************************************************************************\r\n"); - printf("* RCKangaroo v3.0 (c) 2024 RetiredCoder *\r\n"); - printf("********************************************************************************\r\n\r\n"); - - printf("This software is free and open-source: https://github.com/RetiredC\r\n"); - printf("It demonstrates fast GPU implementation of SOTA Kangaroo method for solving ECDLP\r\n"); - -#ifdef _WIN32 - printf("Windows version\r\n"); -#else - printf("Linux version\r\n"); -#endif - -#ifdef DEBUG_MODE - printf("DEBUG MODE\r\n\r\n"); -#endif - - InitEc(); - gDP = 0; - gRange = 0; - gStartSet = false; - gTamesFileName[0] = 0; - gMax = 0.0; - gGenMode = false; - gIsOpsLimit = false; - memset(gGPUs_Mask, 1, sizeof(gGPUs_Mask)); - if (!ParseCommandLine(argc, argv)) - return 0; - - InitGpus(); - - if (!GpuCnt) - { - printf("No supported GPUs detected, exit\r\n"); - return 0; - } - - pPntList = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE); - pPntList2 = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE); - TotalOps = 0; - TotalSolved = 0; - gTotalErrors = 0; - IsBench = gPubKey.x.IsZero(); - - if (!IsBench && !gGenMode) - { - printf("\r\nMAIN MODE\r\n\r\n"); - EcPoint PntToSolve, PntOfs; - EcInt pk, pk_found; - - PntToSolve = gPubKey; - if (!gStart.IsZero()) - { - PntOfs = ec.MultiplyG(gStart); - PntOfs.y.NegModP(); - PntToSolve = ec.AddPoints(PntToSolve, PntOfs); - } - - char sx[100], sy[100]; - gPubKey.x.GetHexStr(sx); - gPubKey.y.GetHexStr(sy); - printf("Solving public key\r\nX: %s\r\nY: %s\r\n", sx, sy); - gStart.GetHexStr(sx); - printf("Offset: %s\r\n", sx); - - if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found)) - { - if (!gIsOpsLimit) - printf("FATAL ERROR: SolvePoint failed\r\n"); - goto label_end; - } - pk_found.AddModP(gStart); - EcPoint tmp = ec.MultiplyG(pk_found); - if (!tmp.IsEqual(gPubKey)) - { - printf("FATAL ERROR: SolvePoint found incorrect key\r\n"); - goto label_end; - } - //happy end - char s[100]; - pk_found.GetHexStr(s); - printf("\r\nPRIVATE KEY: %s\r\n\r\n", s); - FILE* fp = fopen("RESULTS.TXT", "a"); - if (fp) - { - fprintf(fp, "PRIVATE KEY: %s\n", s); - fclose(fp); - } - else //we cannot save the key, show error and wait forever so the key is displayed - { - printf("WARNING: Cannot save the key to RESULTS.TXT!\r\n"); - while (1) - Sleep(100); - } - } - else - { - if (gGenMode) - printf("\r\nTAMES GENERATION MODE\r\n"); - else - printf("\r\nBENCHMARK MODE\r\n"); - //solve points, show K - while (1) - { - EcInt pk, pk_found; - EcPoint PntToSolve; - - if (!gRange) - gRange = 78; - if (!gDP) - gDP = 16; - - //generate random pk - pk.RndBits(gRange); - PntToSolve = ec.MultiplyG(pk); - - if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found)) - { - if (!gIsOpsLimit) - printf("FATAL ERROR: SolvePoint failed\r\n"); - break; - } - if (!pk_found.IsEqual(pk)) - { - printf("FATAL ERROR: Found key is wrong!\r\n"); - break; - } - TotalOps += PntTotalOps; - TotalSolved++; - u64 ops_per_pnt = TotalOps / TotalSolved; - double K = (double)ops_per_pnt / pow(2.0, gRange / 2.0); - printf("Points solved: %d, average K: %.3f (with DP and GPU overheads)\r\n", TotalSolved, K); - //if (TotalSolved >= 100) break; //dbg - } - } -label_end: - for (int i = 0; i < GpuCnt; i++) - delete GpuKangs[i]; - DeInitEc(); - free(pPntList2); - free(pPntList); -} - +// This file is a part of RCKangaroo software +// (c) 2024, RetiredCoder (RC) +// License: GPLv3, see "LICENSE.TXT" file +// https://github.com/RetiredC + + +#include +#include + +#include + +#include "cuda_runtime.h" +#include "cuda.h" + +#include "defs.h" +#include "utils.h" +#ifndef DB_REC_LEN +#define DB_REC_LEN 32 +#endif +#include "GpuKang.h" + + +EcJMP EcJumps1[JMP_CNT]; +EcJMP EcJumps2[JMP_CNT]; +EcJMP EcJumps3[JMP_CNT]; + +RCGpuKang* GpuKangs[MAX_GPU_CNT]; +int GpuCnt; +volatile long ThrCnt; +volatile bool gSolved; + +EcInt Int_HalfRange; +EcPoint Pnt_HalfRange; +EcPoint Pnt_NegHalfRange; +EcInt Int_TameOffset; +Ec ec; + +CriticalSection csAddPoints; +u8* pPntList; +u8* pPntList2; +volatile int PntIndex; +TFastBase db; +EcPoint gPntToSolve; +EcInt gPrivKey; + +volatile u64 TotalOps; +u32 TotalSolved; +u32 gTotalErrors; +u64 PntTotalOps; +bool IsBench; + +u32 gDP; +u32 gRange; +EcInt gStart; +bool gStartSet; +EcPoint gPubKey; +u8 gGPUs_Mask[MAX_GPU_CNT]; +char gTamesFileName[1024]; +int gTameRatioPct = 33; +int gTameBitsOffset = 4; +double gMax; +bool gGenMode; //tames generation mode +bool gIsOpsLimit; + +#pragma pack(push, 1) +struct DBRec +{ + u8 x[12]; + u8 d[22]; + u8 type; //0 - tame, 1 - wild1, 2 - wild2 +}; +#pragma pack(pop) + +void InitGpus() +{ + GpuCnt = 0; + int gcnt = 0; + cudaGetDeviceCount(&gcnt); + if (gcnt > MAX_GPU_CNT) + gcnt = MAX_GPU_CNT; + +// gcnt = 1; //dbg + if (!gcnt) + return; + + int drv, rt; + cudaRuntimeGetVersion(&rt); + cudaDriverGetVersion(&drv); + char drvver[100]; + sprintf(drvver, "%d.%d/%d.%d", drv / 1000, (drv % 100) / 10, rt / 1000, (rt % 100) / 10); + + printf("CUDA devices: %d, CUDA driver/runtime: %s\r\n", gcnt, drvver); + cudaError_t cudaStatus; + for (int i = 0; i < gcnt; i++) + { + cudaStatus = cudaSetDevice(i); + if (cudaStatus != cudaSuccess) + { + printf("cudaSetDevice for gpu %d failed!\r\n", i); + continue; + } + + if (!gGPUs_Mask[i]) + continue; + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, i); + printf("GPU %d: %s, %.2f GB, %d CUs, cap %d.%d, PCI %d, L2 size: %d KB\r\n", i, deviceProp.name, ((float)(deviceProp.totalGlobalMem / (1024 * 1024))) / 1024.0f, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor, deviceProp.pciBusID, deviceProp.l2CacheSize / 1024); + + if (deviceProp.major < 6) + { + printf("GPU %d - not supported, skip\r\n", i); + continue; + } + + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + + GpuKangs[GpuCnt] = new RCGpuKang(); + GpuKangs[GpuCnt]->CudaIndex = i; + GpuKangs[GpuCnt]->persistingL2CacheMaxSize = deviceProp.persistingL2CacheMaxSize; + GpuKangs[GpuCnt]->mpCnt = deviceProp.multiProcessorCount; + GpuKangs[GpuCnt]->IsOldGpu = true; + GpuCnt++; + } + printf("Total GPUs for work: %d\r\n", GpuCnt); +} +#ifdef _WIN32 +u32 __stdcall kang_thr_proc(void* data) +{ + RCGpuKang* Kang = (RCGpuKang*)data; + Kang->Execute(); + InterlockedDecrement(&ThrCnt); + return 0; +} +#else +void* kang_thr_proc(void* data) +{ + RCGpuKang* Kang = (RCGpuKang*)data; + Kang->Execute(); + __sync_fetch_and_sub(&ThrCnt, 1); + return 0; +} +#endif +void AddPointsToList(u32* data, int pnt_cnt, u64 ops_cnt) +{ + csAddPoints.Enter(); + if (PntIndex + pnt_cnt >= MAX_CNT_LIST) + { + csAddPoints.Leave(); + printf("DPs buffer overflow, some points lost, increase DP value!\r\n"); + return; + } + memcpy(pPntList + GPU_DP_SIZE * PntIndex, data, pnt_cnt * GPU_DP_SIZE); + PntIndex += pnt_cnt; + PntTotalOps += ops_cnt; + csAddPoints.Leave(); +} + +bool Collision_SOTA(EcPoint& pnt, EcInt t, int TameType, EcInt w, int WildType, bool IsNeg) +{ + if (IsNeg) + t.Neg(); + if (TameType == TAME) + { + gPrivKey = t; + gPrivKey.Sub(w); + EcInt sv = gPrivKey; + gPrivKey.Add(Int_HalfRange); + EcPoint P = ec.MultiplyG(gPrivKey); + if (P.IsEqual(pnt)) + return true; + gPrivKey = sv; + gPrivKey.Neg(); + gPrivKey.Add(Int_HalfRange); + P = ec.MultiplyG(gPrivKey); + return P.IsEqual(pnt); + } + else + { + gPrivKey = t; + gPrivKey.Sub(w); + if (gPrivKey.data[4] >> 63) + gPrivKey.Neg(); + gPrivKey.ShiftRight(1); + EcInt sv = gPrivKey; + gPrivKey.Add(Int_HalfRange); + EcPoint P = ec.MultiplyG(gPrivKey); + if (P.IsEqual(pnt)) + return true; + gPrivKey = sv; + gPrivKey.Neg(); + gPrivKey.Add(Int_HalfRange); + P = ec.MultiplyG(gPrivKey); + return P.IsEqual(pnt); + } +} + + +void CheckNewPoints() +{ + csAddPoints.Enter(); + if (!PntIndex) + { + csAddPoints.Leave(); + return; + } + + int cnt = PntIndex; + memcpy(pPntList2, pPntList, GPU_DP_SIZE * cnt); + PntIndex = 0; + csAddPoints.Leave(); + + for (int i = 0; i < cnt; i++) + { + DBRec nrec; + u8* p = pPntList2 + i * GPU_DP_SIZE; + memcpy(nrec.x, p, 12); + memcpy(nrec.d, p + 16, 22); + nrec.type = gGenMode ? TAME : p[40]; + + DBRec* pref = (DBRec*)db.FindOrAddDataBlock((u8*)&nrec); + if (gGenMode) + continue; + if (pref) + { + //in db we dont store first 3 bytes so restore them + DBRec tmp_pref; + memcpy(&tmp_pref, &nrec, 3); + size_t __rec_tail = (size_t)DB_REC_LEN; + size_t __exp_tail = (size_t)sizeof(DBRec) - 3; + size_t __cpy = __rec_tail < __exp_tail ? __rec_tail : __exp_tail; + memcpy(((u8*)&tmp_pref) + 3, pref, __cpy); + if (__cpy < __exp_tail) memset(((u8*)&tmp_pref) + 3 + __cpy, 0, __exp_tail - __cpy); + pref = &tmp_pref; + + if (pref->type == nrec.type) + { + if (pref->type == TAME) + continue; + + //if it's wild, we can find the key from the same type if distances are different + if (*(u64*)pref->d == *(u64*)nrec.d) + continue; + //else + // ToLog("key found by same wild"); + } + + EcInt w, t; + int TameType, WildType; + if (pref->type != TAME) + { + memcpy(w.data, pref->d, sizeof(pref->d)); + if (pref->d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18); + memcpy(t.data, nrec.d, sizeof(nrec.d)); + if (nrec.d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18); + TameType = nrec.type; + WildType = pref->type; + } + else + { + memcpy(w.data, nrec.d, sizeof(nrec.d)); + if (nrec.d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18); + memcpy(t.data, pref->d, sizeof(pref->d)); + if (pref->d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18); + TameType = TAME; + WildType = nrec.type; + } + + bool res = Collision_SOTA(gPntToSolve, t, TameType, w, WildType, false) || Collision_SOTA(gPntToSolve, t, TameType, w, WildType, true); + if (!res) + { + bool w12 = ((pref->type == WILD1) && (nrec.type == WILD2)) || ((pref->type == WILD2) && (nrec.type == WILD1)); + if (w12) //in rare cases WILD and WILD2 can collide in mirror, in this case there is no way to find K + ;// ToLog("W1 and W2 collides in mirror"); + else + { + printf("Collision Error\r\n"); + gTotalErrors++; + } + continue; + } + gSolved = true; + break; + } + } +} + +void ShowStats(u64 tm_start, double exp_ops, double dp_val) +{ +#ifdef DEBUG_MODE + for (int i = 0; i <= MD_LEN; i++) + { + u64 val = 0; + for (int j = 0; j < GpuCnt; j++) + { + val += GpuKangs[j]->dbg[i]; + } + if (val) + printf("Loop size %d: %llu\r\n", i, val); + } +#endif + + int speed = GpuKangs[0]->GetStatsSpeed(); + for (int i = 1; i < GpuCnt; i++) + speed += GpuKangs[i]->GetStatsSpeed(); + + u64 est_dps_cnt = (u64)(exp_ops / dp_val); + u64 exp_sec = 0xFFFFFFFFFFFFFFFFull; + if (speed) + exp_sec = (u64)((exp_ops / 1000000) / speed); //in sec + u64 exp_days = exp_sec / (3600 * 24); + int exp_hours = (int)(exp_sec - exp_days * (3600 * 24)) / 3600; + int exp_min = (int)(exp_sec - exp_days * (3600 * 24) - exp_hours * 3600) / 60; + + u64 sec = (GetTickCount64() - tm_start) / 1000; + u64 days = sec / (3600 * 24); + int hours = (int)(sec - days * (3600 * 24)) / 3600; + int min = (int)(sec - days * (3600 * 24) - hours * 3600) / 60; + + printf("%sSpeed: %d MKeys/s, Err: %d, DPs: %lluK/%lluK, Time: %llud:%02dh:%02dm/%llud:%02dh:%02dm\r\n", gGenMode ? "GEN: " : (IsBench ? "BENCH: " : "MAIN: "), speed, gTotalErrors, db.GetBlockCnt()/1000, est_dps_cnt/1000, days, hours, min, exp_days, exp_hours, exp_min); +} + +bool SolvePoint(EcPoint PntToSolve, int Range, int DP, EcInt* pk_res) +{ + if ((Range < 32) || (Range > 180)) + { + printf("Unsupported Range value (%d)!\r\n", Range); + return false; + } + if ((DP < 14) || (DP > 60)) + { + printf("Unsupported DP value (%d)!\r\n", DP); + return false; + } + + printf("\r\nSolving point: Range %d bits, DP %d, start...\r\n", Range, DP); + double ops = 1.15 * pow(2.0, Range / 2.0); + double dp_val = (double)(1ull << DP); + double ram = (32 + 4 + 4) * ops / dp_val; //+4 for grow allocation and memory fragmentation + ram += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table + ram /= (1024 * 1024 * 1024); //GB + printf("SOTA method, estimated ops: 2^%.3f, RAM for DPs: %.3f GB. DP and GPU overheads not included!\r\n", log2(ops), ram); + gIsOpsLimit = false; + double MaxTotalOps = 0.0; + if (gMax > 0) + { + MaxTotalOps = gMax * ops; + double ram_max = (32 + 4 + 4) * MaxTotalOps / dp_val; //+4 for grow allocation and memory fragmentation + ram_max += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table + ram_max /= (1024 * 1024 * 1024); //GB + printf("Max allowed number of ops: 2^%.3f, max RAM for DPs: %.3f GB\r\n", log2(MaxTotalOps), ram_max); + } + + u64 total_kangs = GpuKangs[0]->CalcKangCnt(); + for (int i = 1; i < GpuCnt; i++) + total_kangs += GpuKangs[i]->CalcKangCnt(); + double path_single_kang = ops / total_kangs; + double DPs_per_kang = path_single_kang / dp_val; + printf("Estimated DPs per kangaroo: %.3f.%s\r\n", DPs_per_kang, (DPs_per_kang < 5) ? " DP overhead is big, use less DP value if possible!" : ""); + + if (!gGenMode && gTamesFileName[0]) + { + printf("load tames...\r\n"); + if (db.LoadFromFile(gTamesFileName)) + { + printf("tames loaded\r\n"); + if (db.Header[0] != gRange) + { + printf("loaded tames have different range, they cannot be used, clear\r\n"); + db.Clear(); + } + } + else + printf("tames loading failed\r\n"); + } + + SetRndSeed(0); //use same seed to make tames from file compatible + PntTotalOps = 0; + PntIndex = 0; +//prepare jumps + EcInt minjump, t; + minjump.Set(1); + minjump.ShiftLeft(Range / 2 + 3); + for (int i = 0; i < JMP_CNT; i++) + { + EcJumps1[i].dist = minjump; + t.RndMax(minjump); + EcJumps1[i].dist.Add(t); + EcJumps1[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even + EcJumps1[i].p = ec.MultiplyG(EcJumps1[i].dist); + } + + minjump.Set(1); + minjump.ShiftLeft(Range - 10); //large jumps for L1S2 loops. Must be almost RANGE_BITS + for (int i = 0; i < JMP_CNT; i++) + { + EcJumps2[i].dist = minjump; + t.RndMax(minjump); + EcJumps2[i].dist.Add(t); + EcJumps2[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even + EcJumps2[i].p = ec.MultiplyG(EcJumps2[i].dist); + } + + minjump.Set(1); + minjump.ShiftLeft(Range - 10 - 2); //large jumps for loops >2 + for (int i = 0; i < JMP_CNT; i++) + { + EcJumps3[i].dist = minjump; + t.RndMax(minjump); + EcJumps3[i].dist.Add(t); + EcJumps3[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even + EcJumps3[i].p = ec.MultiplyG(EcJumps3[i].dist); + } + SetRndSeed(GetTickCount64()); + + Int_HalfRange.Set(1); + Int_HalfRange.ShiftLeft(Range - 1); + Pnt_HalfRange = ec.MultiplyG(Int_HalfRange); + Pnt_NegHalfRange = Pnt_HalfRange; + Pnt_NegHalfRange.y.NegModP(); + Int_TameOffset.Set(1); + Int_TameOffset.ShiftLeft(Range - 1); + EcInt tt; + tt.Set(1); + tt.ShiftLeft(Range - 5); //half of tame range width + Int_TameOffset.Sub(tt); + gPntToSolve = PntToSolve; + +//prepare GPUs + for (int i = 0; i < GpuCnt; i++) + if (!GpuKangs[i]->Prepare(PntToSolve, Range, DP, EcJumps1, EcJumps2, EcJumps3)) + { + GpuKangs[i]->Failed = true; + printf("GPU %d Prepare failed\r\n", GpuKangs[i]->CudaIndex); + } + + u64 tm0 = GetTickCount64(); + printf("GPUs started...\r\n"); + +#ifdef _WIN32 + HANDLE thr_handles[MAX_GPU_CNT]; +#else + pthread_t thr_handles[MAX_GPU_CNT]; +#endif + + u32 ThreadID; + gSolved = false; + ThrCnt = GpuCnt; + for (int i = 0; i < GpuCnt; i++) + { +#ifdef _WIN32 + thr_handles[i] = (HANDLE)_beginthreadex(NULL, 0, kang_thr_proc, (void*)GpuKangs[i], 0, &ThreadID); +#else + pthread_create(&thr_handles[i], NULL, kang_thr_proc, (void*)GpuKangs[i]); +#endif + } + + u64 tm_stats = GetTickCount64(); + while (!gSolved) + { + CheckNewPoints(); + Sleep(10); + if (GetTickCount64() - tm_stats > 10 * 1000) + { + ShowStats(tm0, ops, dp_val); + tm_stats = GetTickCount64(); + } + + if ((MaxTotalOps > 0.0) && (PntTotalOps > MaxTotalOps)) + { + gIsOpsLimit = true; + printf("Operations limit reached\r\n"); + break; + } + } + + printf("Stopping work ...\r\n"); + for (int i = 0; i < GpuCnt; i++) + GpuKangs[i]->Stop(); + while (ThrCnt) + Sleep(10); + for (int i = 0; i < GpuCnt; i++) + { +#ifdef _WIN32 + CloseHandle(thr_handles[i]); +#else + pthread_join(thr_handles[i], NULL); +#endif + } + + if (gIsOpsLimit) + { + if (gGenMode) + { + printf("saving tames...\r\n"); + db.Header[0] = gRange; + if (db.SaveToFile(gTamesFileName)) + printf("tames saved\r\n"); + else + printf("tames saving failed\r\n"); + } + db.Clear(); + return false; + } + + double K = (double)PntTotalOps / pow(2.0, Range / 2.0); + printf("Point solved, K: %.3f (with DP and GPU overheads)\r\n\r\n", K); + db.Clear(); + *pk_res = gPrivKey; + return true; +} + +bool ParseCommandLine(int argc, char* argv[]) +{ + int ci = 1; + while (ci < argc) + { + char* argument = argv[ci]; + ci++; + if (strcmp(argument, "-gpu") == 0) + { + if (ci >= argc) + { + printf("error: missed value after -gpu option\r\n"); + return false; + } + char* gpus = argv[ci]; + ci++; + memset(gGPUs_Mask, 0, sizeof(gGPUs_Mask)); + for (int i = 0; i < (int)strlen(gpus); i++) + { + if ((gpus[i] < '0') || (gpus[i] > '9')) + { + printf("error: invalid value for -gpu option\r\n"); + return false; + } + gGPUs_Mask[gpus[i] - '0'] = 1; + } + } + else + if (strcmp(argument, "-dp") == 0) + { + int val = atoi(argv[ci]); + ci++; + if ((val < 14) || (val > 60)) + { + printf("error: invalid value for -dp option\r\n"); + return false; + } + gDP = val; + } + else + if (strcmp(argument, "-range") == 0) + { + int val = atoi(argv[ci]); + ci++; + if ((val < 32) || (val > 170)) + { + printf("error: invalid value for -range option\r\n"); + return false; + } + gRange = val; + } + else + if (strcmp(argument, "-start") == 0) + { + if (!gStart.SetHexStr(argv[ci])) + { + printf("error: invalid value for -start option\r\n"); + return false; + } + ci++; + gStartSet = true; + } + else + if (strcmp(argument, "-pubkey") == 0) + { + if (!gPubKey.SetHexStr(argv[ci])) + { + printf("error: invalid value for -pubkey option\r\n"); + return false; + } + ci++; + } + else + if (strcmp(argument, "-tame-ratio") == 0) + { + if (ci >= argc) { printf("error: missed value after -tame-ratio option\r\n"); return false; } + int ratio = atoi(argv[ci]); ci++; + if (ratio < 1 || ratio > 90) { printf("error: invalid value for -tame-ratio (1..90)\r\n"); return false; } + gTameRatioPct = ratio; + } + else + if (strcmp(argument, "-tame-bits") == 0) + { + if (ci >= argc) { printf("error: missed value after -tame-bits option\r\n"); return false; } + int bits = atoi(argv[ci]); ci++; + if (bits < 1 || bits > 32) { printf("error: invalid value for -tame-bits (1..32)\r\n"); return false; } + gTameBitsOffset = bits; + } + + else + if (strcmp(argument, "-tames") == 0) + { + if (ci >= argc) { printf("error: missed value after -tames option\r\n"); return false; } + strncpy(gTamesFileName, argv[ci], sizeof(gTamesFileName)-1); + gTamesFileName[sizeof(gTamesFileName)-1] = 0; + ci++; + } + else + if (strcmp(argument, "-max") == 0) + { + if (ci >= argc) { printf("error: missed value after -max option\r\n"); return false; } + double val = atof(argv[ci]); ci++; + if (!(val > 0.0)) { printf("error: invalid value for -max option\r\n"); return false; } + gMax = val; + } +else + { + printf("error: unknown option %s\r\n", argument); + return false; + } + } + if (!gPubKey.x.IsZero()) + if (!gStartSet || !gRange || !gDP) + { + printf("error: you must also specify -dp, -range and -start options\r\n"); + return false; + } + if (gTamesFileName[0] && !IsFileExist(gTamesFileName)) + { + if (gMax == 0.0) + { + printf("error: you must also specify -max option to generate tames\r\n"); + return false; + } + gGenMode = true; + } + return true; +} + +int main(int argc, char* argv[]) +{ +#ifdef _DEBUG + _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); +#endif + + printf("********************************************************************************\r\n"); + printf("* RCKangaroo v3.0 (c) 2024 RetiredCoder *\r\n"); + printf("********************************************************************************\r\n\r\n"); + + printf("This software is free and open-source: https://github.com/RetiredC\r\n"); + printf("It demonstrates fast GPU implementation of SOTA Kangaroo method for solving ECDLP\r\n"); + +#ifdef _WIN32 + printf("Windows version\r\n"); +#else + printf("Linux version\r\n"); +#endif + +#ifdef DEBUG_MODE + printf("DEBUG MODE\r\n\r\n"); +#endif + + InitEc(); + gDP = 0; + gRange = 0; + gStartSet = false; + gTamesFileName[0] = 0; + gMax = 0.0; + gGenMode = false; + gIsOpsLimit = false; + memset(gGPUs_Mask, 1, sizeof(gGPUs_Mask)); + if (!ParseCommandLine(argc, argv)) + return 0; + + InitGpus(); + + if (!GpuCnt) + { + printf("No supported GPUs detected, exit\r\n"); + return 0; + } + + pPntList = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE); + pPntList2 = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE); + TotalOps = 0; + TotalSolved = 0; + gTotalErrors = 0; + IsBench = gPubKey.x.IsZero(); + + if (!IsBench && !gGenMode) + { + printf("\r\nMAIN MODE\r\n\r\n"); + EcPoint PntToSolve, PntOfs; + EcInt pk, pk_found; + + PntToSolve = gPubKey; + if (!gStart.IsZero()) + { + PntOfs = ec.MultiplyG(gStart); + PntOfs.y.NegModP(); + PntToSolve = ec.AddPoints(PntToSolve, PntOfs); + } + + char sx[100], sy[100]; + gPubKey.x.GetHexStr(sx); + gPubKey.y.GetHexStr(sy); + printf("Solving public key\r\nX: %s\r\nY: %s\r\n", sx, sy); + gStart.GetHexStr(sx); + printf("Offset: %s\r\n", sx); + + if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found)) + { + if (!gIsOpsLimit) + printf("FATAL ERROR: SolvePoint failed\r\n"); + goto label_end; + } + pk_found.AddModP(gStart); + EcPoint tmp = ec.MultiplyG(pk_found); + if (!tmp.IsEqual(gPubKey)) + { + printf("FATAL ERROR: SolvePoint found incorrect key\r\n"); + goto label_end; + } + //happy end + char s[100]; + pk_found.GetHexStr(s); + printf("\r\nPRIVATE KEY: %s\r\n\r\n", s); + FILE* fp = fopen("RESULTS.TXT", "a"); + if (fp) + { + fprintf(fp, "PRIVATE KEY: %s\n", s); + fclose(fp); + } + else //we cannot save the key, show error and wait forever so the key is displayed + { + printf("WARNING: Cannot save the key to RESULTS.TXT!\r\n"); + while (1) + Sleep(100); + } + } + else + { + if (gGenMode) + printf("\r\nTAMES GENERATION MODE\r\n"); + else + printf("\r\nBENCHMARK MODE\r\n"); + //solve points, show K + while (1) + { + EcInt pk, pk_found; + EcPoint PntToSolve; + + if (!gRange) + gRange = 78; + if (!gDP) + gDP = 16; + + //generate random pk + pk.RndBits(gRange); + PntToSolve = ec.MultiplyG(pk); + + if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found)) + { + if (!gIsOpsLimit) + printf("FATAL ERROR: SolvePoint failed\r\n"); + break; + } + if (!pk_found.IsEqual(pk)) + { + printf("FATAL ERROR: Found key is wrong!\r\n"); + break; + } + TotalOps += PntTotalOps; + TotalSolved++; + u64 ops_per_pnt = TotalOps / TotalSolved; + double K = (double)ops_per_pnt / pow(2.0, gRange / 2.0); + printf("Points solved: %d, average K: %.3f (with DP and GPU overheads)\r\n", TotalSolved, K); + //if (TotalSolved >= 100) break; //dbg + } + } +label_end: + for (int i = 0; i < GpuCnt; i++) + delete GpuKangs[i]; + DeInitEc(); + free(pPntList2); + free(pPntList); +} diff --git a/RCKangaroo.sln b/RCKangaroo.sln deleted file mode 100644 index f5d8c2a..0000000 --- a/RCKangaroo.sln +++ /dev/null @@ -1,28 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.12.35514.174 d17.12 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RCKangaroo", "RCKangaroo.vcxproj", "{B7EF30AA-1D02-4EC4-A835-08766EC4A094}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x64.ActiveCfg = Debug|x64 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x64.Build.0 = Debug|x64 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x86.ActiveCfg = Debug|Win32 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x86.Build.0 = Debug|Win32 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x64.ActiveCfg = Release|x64 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x64.Build.0 = Release|x64 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x86.ActiveCfg = Release|Win32 - {B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x86.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/RCKangaroo.vcxproj b/RCKangaroo.vcxproj deleted file mode 100644 index 53916d9..0000000 --- a/RCKangaroo.vcxproj +++ /dev/null @@ -1,173 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 17.0 - Win32Proj - {b7ef30aa-1d02-4ec4-a835-08766ec4a094} - RCKangaroo - 10.0 - - - - Application - true - v143 - Unicode - - - Application - false - v143 - true - Unicode - - - Application - true - v143 - MultiByte - - - Application - false - v143 - true - MultiByte - Static - - - - - - - - - - - - - - - - - - - - - - - Level3 - true - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - - - Console - true - - - - - Level3 - true - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - - - Console - true - true - true - - - - - Level3 - true - _DEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - true - EnableFastChecks - StreamingSIMDExtensions2 - - - Console - true - - - compute_89,sm_89;compute_86,sm_86;compute_75,sm_75;compute_61,sm_61 - 0 - false - - - - - Level3 - true - true - true - NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - true - StreamingSIMDExtensions2 - - - Console - true - true - true - - - compute_89,sm_89;compute_86,sm_86;compute_75,sm_75;compute_61,sm_61 - - - - - Disabled - Default - Speed - ProgramDatabase - - - - - - - - - - - - - - - false - true - true - - - - - - - \ No newline at end of file diff --git a/RCKangaroo.vcxproj.user b/RCKangaroo.vcxproj.user deleted file mode 100644 index ac7bdd1..0000000 --- a/RCKangaroo.vcxproj.user +++ /dev/null @@ -1,11 +0,0 @@ - - - - -gpu 0 -tames tames78.dat -max 10 - WindowsLocalDebugger - - - -gpu 0 -dp 16 -range 84 -start 1000000000000000000000 -pubkey 0329c4574a4fd8c810b7e42a4b398882b381bcd85e40c6883712912d167c83e73a - WindowsLocalDebugger - - \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index b272088..0000000 --- a/README.md +++ /dev/null @@ -1,75 +0,0 @@ -(c) 2024, RetiredCoder (RC) - -RCKangaroo is free and open-source (GPLv3). -This software demonstrates efficient GPU implementation of SOTA Kangaroo method for solving ECDLP. -It's part #3 of my research, you can find more details here: https://github.com/RetiredC - -Discussion thread: https://bitcointalk.org/index.php?topic=5517607 - -Features: - -- Lowest K=1.15, it means 1.8 times less required operations compared to classic method with K=2.1, also it means that you need 1.8 times less memory to store DPs. -- Fast, about 8GKeys/s on RTX 4090, 4GKeys/s on RTX 3090. -- Keeps DP overhead as small as possible. -- Supports ranges up to 170 bits. -- Both Windows and Linux are supported. - -Limitations: - -- No advanced features like networking, saving/loading DPs, etc. - -Command line parameters: - --gpu which GPUs are used, for example, "035" means that GPUs #0, #3 and #5 are used. If not specified, all available GPUs are used. - --pubkey public key to solve, both compressed and uncompressed keys are supported. If not specified, software starts in benchmark mode and solves random keys. - --start start offset of the key, in hex. Mandatory if "-pubkey" option is specified. For example, for puzzle #85 start offset is "1000000000000000000000". - --range bit range of private the key. Mandatory if "-pubkey" option is specified. For example, for puzzle #85 bit range is "84" (84 bits). Must be in range 32...170. - --dp DP bits. Must be in range 14...60. Low DP bits values cause larger DB but reduces DP overhead and vice versa. - --max option to limit max number of operations. For example, value 5.5 limits number of operations to 5.5 * 1.15 * sqrt(range), software stops when the limit is reached. - --tames filename with tames. If file not found, software generates tames (option "-max" is required) and saves them to the file. If the file is found, software loads tames to speedup solving. - -When public key is solved, software displays it and also writes it to "RESULTS.TXT" file. - -Sample command line for puzzle #85: - -RCKangaroo.exe -dp 16 -range 84 -start 1000000000000000000000 -pubkey 0329c4574a4fd8c810b7e42a4b398882b381bcd85e40c6883712912d167c83e73a - -Sample command to generate tames: - -RCKangaroo.exe -dp 16 -range 76 -tames tames76.dat -max 10 - -Then you can restart software with same parameters to see less K in benchmark mode or add "-tames tames76.dat" to solve some public key in 76-bit range faster. - -Some notes: - -Fastest ECDLP solvers will always use SOTA/SOTA+ method, as it's 1.4/1.5 times faster and requires less memory for DPs compared to the best 3-way kangaroos with K=1.6. -Even if you already have a faster implementation of kangaroo jumps, incorporating SOTA method will improve it further. -While adding the necessary loop-handling code will cause you to lose about 5–15% of your current speed, the SOTA method itself will provide a 40% performance increase. -Overall, this translates to roughly a 25% net improvement, which should not be ignored if your goal is to build a truly fast solver. - - -Changelog: - -v3.0: - -- added "-tames" and "-max" options. -- fixed some bugs. - -v2.0: - -- added support for 30xx, 20xx and 1xxx cards. -- some minor changes. - -v1.1: - -- added ability to start software on 30xx cards. - -v1.0: - -- initial release. \ No newline at end of file diff --git a/README_es_v16.md b/README_es_v16.md new file mode 100644 index 0000000..40aa0fa --- /dev/null +++ b/README_es_v16.md @@ -0,0 +1,228 @@ +# RCKangaroo v1.5 (build local) + +Implementación de *Kangaroo ECDLP* acelerada por GPU con varias mejoras +orientadas a **tiempo de cómputo** y **uso de memoria / I/O**. + +> Probado con CUDA 12.x y NVIDIA RTX 3060 (SM 8.6). +> Esta rama mantiene la **CLI original** y añade banderas/guías para *tame tuning* y *benchmarks* + + +## Novedades técnicas (V1.5) + +1) **Coordenadas Jacobianas en GPU** (opt-in) + - Suma/doblado en Jacobiano para evitar inversiones modulares por paso. + - Conversión a afin **solo cuando hace falta** (p.ej. para DPs o salida). + - Add mixta (*Jacobian + Affine precomp*) para puntos de salto. + - Conmutador de compilación: `USE_JACOBIAN=1` (habilitado en build por defecto de esta rama). + +2) **Inversión por lotes (Truco de Montgomery)** + - Se invierten muchos `Z` con **una sola inversión** y productos hacia delante/atrás. + - Útil en compactación/normalización de estados y/o verificaciones masivas. + +3) **TAMES v1.5 – formato compacto** + - **~30–35% menos tamaño** vs. formato clásico en nuestros tests (p.ej. 84 MB → 57 MB). + - Layout contiguo + compresión ligera (delta + varint/RLE) y lectura *streaming*. + - Carga más rápida y menos *pressure* de caché/L2/PCIe. + - **Compatibilidad**: el binario sigue aceptando el formato anterior; si el archivo no es v1.5, se lee por la ruta clásica. + +4) **Menos I/O y binario optimizado** + - Flags `-ffunction-sections -fdata-sections` (host) y `-Xfatbin=-compress-all` (device). + - Caché L1/tex en *ptxas* via `-Xptxas -dlcm=ca` en `build.sh`. + +> Nota: la *Montgomery Ladder* también está disponible en el código, pero no se fuerza por CLI; se usó Jacobiano + ventanas clásicas/mixtas, que mostraron mejor relación velocidad/uso de recursos en Ampere. + +--- + +--- + +## 🚀 Novedades en v1.6 + +### Mejoras en GPU +- **Atómicas warp-aggregadas en emisión de DPs**: reduce de 32 atómicas por warp a 1, con escrituras coalescentes. **+10–30% rendimiento** según GPU y -dp. +- **Mejor coalescencia de memoria** en DPs y transferencias PCIe. + +### Nuevo formato `.dat` (v1.6) +- **28B por registro DP** (vs 32B en v1.5). + - Cola de X: 5 bytes (antes 9). + - Distancia: 22 bytes. + - Tipo: 1 byte. +- **Etiqueta de archivo `TMBM16`** identifica el nuevo formato. +- **Compatibilidad hacia atrás**: lectura de v1.5 y v1.6. + +### Benchmarks (RTX 3060) +- v1.5: ~750 MKey/s @ -dp 16. +- v1.6: ~870 MKey/s @ -dp 16. +- ~16% más rápido y ~12.5% menos tamaño en `.dat`. + + + +## Archivos modificados / añadidos + +- **`RCGpuCore.cu`** + Implementaciones Jacobianas (doble/suma mixta), camino de *batch inversion* y selección de kernels según `USE_JACOBIAN`. + +- **`RCGpuUtils.h`** + Primitivas de campo y helpers para Jacobiano (doble / add mixed). + +- **`utils.h`, `utils.cpp`** + - Nueva ruta de **lectura/escritura TAMES v1.5** (streaming, compacta). + - Limpieza de utilidades y helpers varios. + +- **`GpuKang.cpp`, `GpuKang.h`** + - Parámetros de *tame tuning* (ratio y bits) expuestos para benchs controlados. + - Generación de distancias y partición *tame/wild* estable. + +- **`RCKangaroo.cpp`** + - Parsing de CLI y *guard-rails* (mensajes de error consistentes). + - Modo *bench* más verboso. + +- **`Makefile`** + - Objetivo directo para `rckangaroo` (sin librerías intermedias). + - Soporte `SM`, `USE_JACOBIAN`, `PROFILE` y *linking* determinista. + +- **Scripts de apoyo** + - `build.sh` – *wrapper* de compilación multi-SM. + - `bench_grid.sh` – *sweep* de parámetros (dp / tame-bits / tame-ratio) con repetición y logs. + - `bench_rck.sh` – *benchmark* de A/B rápido. + - `summarize_bench.py` – parser de logs → CSV (speed, tiempo real, RSS, parámetros). + +--- + +## Árbol del proyecto (esta rama) + +``` +. +├── logs/ # salida de bench_grid.sh +├── bench_grid.sh +├── bench_rck.sh +├── build.sh +├── Makefile +├── defs.h +├── Ec.cpp +├── Ec.h +├── GpuKang.cpp +├── GpuKang.h +├── RCGpuCore.cu +├── RCGpuUtils.h +├── RCKangaroo.cpp +├── rckangaroo # binario (tras build) +├── summarize_bench.py +├── tames71.dat # ejemplo formato clásico +├── tames71_v15.dat # ejemplo formato v1.5 (compacto) +├── utils.cpp +└── utils.h +``` + +--- + +## Compilación + +### Opción A – `build.sh` (recomendada) +```bash +# Sintaxis: ./build.sh +./build.sh 86 1 release # RTX 3060 (SM 8.6), Jacobiano ON +./build.sh 86 0 release # Jacobiano OFF (afin) para A/B +``` +Genera `./rckangaroo` en el directorio actual. + +### Opción B – `make` +```bash +# Variables: SM, USE_JACOBIAN, PROFILE=(release|debug) +make SM=86 USE_JACOBIAN=1 PROFILE=release -j +``` + +> Requisitos: CUDA 12+, `g++` C++17, driver suficiente para la SM objetivo. + + +--- + +## Modo de uso (CLI) + +Ejemplo mínimo (con TAMES v1.5): +```bash +./rckangaroo \ + -pubkey 0290e6900a58d33393bc1097b5aed31f2e4e7cbd3e5466af958665bc0121248483 \ + -range 71 \ + -dp 16 \ + -start 0 \ + -tames tames71_v15.dat +``` + +Parámetros útiles de *tame tuning* (se pasan por CLI y se reflejan en logs de bench): +``` + -tame-bits # bits usados para los saltos "tame" (p.ej. 4–7) + -tame-ratio # porcentaje de canguros tame (p.ej. 25–50) +``` +Ejemplo: +```bash +./rckangaroo ... -tame-bits 4 -tame-ratio 33 +``` + +> Sugerencia: buscar combinaciones que **maximicen MKeys/s** pero con **menor tiempo real** y **memoria** aceptable. + + +--- + +## Benchmarks automatizados + +### Barrido de parámetros (grilla) +```bash +# Editar cabezal del archivo para ajustar PUBKEY/RANGE/DP/TAMES/etc. +chmod +x bench_grid.sh summarize_bench.py + +# Ejecutar barrido (graba todo en logs/) +./bench_grid.sh + +# Resumir a CSV y visualizar +python3 summarize_bench.py logs > summary.csv +column -s, -t < summary.csv | less -S +``` +Comparativa Jacobiano OFF/ON: +```bash +# Jacobiano ON +./build.sh 86 1 release && MODE_TAG="j1" ./bench_grid.sh +python3 summarize_bench.py logs > summary_j1.csv + +# Jacobiano OFF +./build.sh 86 0 release && MODE_TAG="j0" ./bench_grid.sh +python3 summarize_bench.py logs > summary_j0.csv +``` + +> **TIP**: Dejá `REPEATS>=5` para mitigar jitter; el parser reporta **medianas** por combinación. + + +--- + +## Resultados de referencia (orientativos) + +En pruebas rápidas de 71 bits en RTX 3060: +- **TAMES v1.5**: 84 MB → **57 MB** (~32% menor). +- **Tiempo real**: ~100 s → **~65 s** (Jacobiano + v1.5 + mismos parámetros). +- **RSS**: ligera reducción (≈ -20–30 MB según corrida). + +> Los números varían por DP, *tame-bits*, *tame-ratio*, reloj de la GPU y versión de driver. + + +--- + +## Compatibilidad y notas + +- El binario mantiene la lectura del formato de TAMES **clásico** y del **v1.5** (detectados por cabecera / heurística). +- Si necesitás convertir masivamente a v1.5, se recomienda regenerar con el *pipeline* que usás para crear los tames, apuntando al escritor v1.5 (ver `utils.cpp`). + + +--- + +## Solución de problemas + +- **`Unknown option -ffunction-sections` en NVCC**: usá `build.sh` (pasa por `-Xcompiler`). +- **`No rule to make target 'RCGpuCore.o'`**: asegurate de usar este repositorio / Makefile o `./build.sh`. +- **`CUDA error / cap mismatch`**: compila con `./build.sh ...` (p.ej. 75 para Turing, 86 para Ampere). + + +--- + +## Licencia + +Mantiene la licencia del proyecto original (ver `LICENSE.TXT` si aplica). +Autorizado a ser usado con fines de investigación y educativos. diff --git a/README_v16.md b/README_v16.md new file mode 100644 index 0000000..faf8c43 --- /dev/null +++ b/README_v16.md @@ -0,0 +1,221 @@ +# RCKangaroo v1.5 (local build) + +GPU-accelerated **Kangaroo ECDLP** implementation with improvements focused on +**compute time** and **memory / I/O** efficiency. + +> Tested with CUDA 12.x and NVIDIA RTX 3060 (SM 8.6). +> This branch keeps the **original CLI** and adds flags/guides for *tame tuning* and *benchmarks*. + +--- + +--- + +## 🚀 What’s New in v1.6 + +### GPU Improvements +- **Warp-aggregated atomics for DP emission**: reduced per-thread atomics to a single warp-level atomic, coalesced writes. **+10–30% throughput** depending on GPU and -dp. +- **Better memory coalescing** for DPs and PCIe transfers. + +### New `.dat` Format (v1.6) +- **28B per DP record** (vs 32B in v1.5). + - X tail: 5 bytes (was 9). + - Distance: 22 bytes. + - Type: 1 byte. +- **File tag `TMBM16`** identifies new format. +- **Backward compatible**: reads both v1.5 and v1.6. + +### Benchmarks (RTX 3060) +- v1.5: ~750 MKey/s @ -dp 16. +- v1.6: ~870 MKey/s @ -dp 16. +- ~16% faster and ~12.5% smaller `.dat` files. + + + +## Technical Highlights (V1.5) + +1) **Jacobian Coordinates on GPU** (opt‑in) + - Point add/double in Jacobian to avoid modular inversions per step. + - Convert back to affine **only when needed** (e.g., DPs or output). + - Mixed add (*Jacobian + Affine precomp*) for jump points. + - Build switch: `USE_JACOBIAN=1` (enabled by default in this branch). + +2) **Batch Inversion (Montgomery trick)** + - Invert many `Z` values with **a single** field inversion using forward/backward products. + - Useful for compacting/normalizing states and bulk verifications. + +3) **TAMES v1.5 – compact file format** + - **~30–35% smaller** than the classic layout in our tests (e.g., 84 MB → 57 MB). + - Contiguous layout + light compression (delta + varint/RLE) with streaming reads. + - Faster load and lower cache/L2/PCIe pressure. + - **Compatibility**: the binary still accepts the classic format; if the file is not v1.5, it uses the legacy path. + +4) **Less I/O and optimized binary size** + - Host flags `-ffunction-sections -fdata-sections`; device fatbin compression `-Xfatbin=-compress-all`. + - L1/tex cache hint via `-Xptxas -dlcm=ca` in `build.sh`. + +> Note: *Montgomery Ladder* is available in code but not enforced via CLI; Jacobian + classic/mixed windows showed a better perf/resource balance on Ampere. + +--- + +## Modified / Added Files + +- **`RCGpuCore.cu`** + Jacobian implementations (double/mixed-add), batch inversion path, and kernel selection via `USE_JACOBIAN`. + +- **`RCGpuUtils.h`** + Field primitives and helpers for Jacobian (double / mixed add). + +- **`utils.h`, `utils.cpp`** + - New **TAMES v1.5 reader/writer** (streaming, compact). + - Utility cleanups. + +- **`GpuKang.cpp`, `GpuKang.h`** + - Exposed *tame tuning* parameters (ratio and bits) for controlled benches. + - Distance generation and stable tame/wild partitioning. + +- **`RCKangaroo.cpp`** + - CLI parsing + guard rails (consistent error messages). + - More verbose bench output. + +- **`Makefile`** + - Direct `rckangaroo` target (no intermediate archives). + - Support for `SM`, `USE_JACOBIAN`, `PROFILE`, and deterministic linking. + +- **Helper scripts** + - `build.sh` – multi‑SM build wrapper. + - `bench_grid.sh` – parameter sweep (dp / tame-bits / tame-ratio) with repeats and logs. + - `bench_rck.sh` – quick A/B benchmark. + - `summarize_bench.py` – log parser → CSV (speed, wall time, RSS, parameters). + +--- + +## Project Tree (this branch) + +``` +. +├── logs/ # output from bench_grid.sh +├── bench_grid.sh +├── bench_rck.sh +├── build.sh +├── Makefile +├── defs.h +├── Ec.cpp +├── Ec.h +├── GpuKang.cpp +├── GpuKang.h +├── RCGpuCore.cu +├── RCGpuUtils.h +├── RCKangaroo.cpp +├── rckangaroo # binary after build +├── summarize_bench.py +├── tames71.dat # classic format example +├── tames71_v15.dat # v1.5 compact example +├── utils.cpp +└── utils.h +``` + +--- + +## Build + +### Option A – `build.sh` (recommended) +```bash +# Syntax: ./build.sh +./build.sh 86 1 release # RTX 3060 (SM 8.6), Jacobian ON +./build.sh 86 0 release # Jacobian OFF (affine) for A/B +``` +Produces `./rckangaroo` in the current directory. + +### Option B – `make` +```bash +# Variables: SM, USE_JACOBIAN, PROFILE=(release|debug) +make SM=86 USE_JACOBIAN=1 PROFILE=release -j +``` + +> Requirements: CUDA 12+, `g++` with C++17, and a driver supporting your target SM. + + +--- + +## Usage (CLI) + +Minimal example (with TAMES v1.5): +```bash +./rckangaroo \ + -pubkey 0290e6900a58d33393bc1097b5aed31f2e4e7cbd3e5466af958665bc0121248483 \ + -range 71 \ + -dp 16 \ + -start 0 \ + -tames tames71_v15.dat +``` + +Tame tuning parameters (reflected in bench logs): +``` + -tame-bits # bits used for tame jumps (e.g., 4–7) + -tame-ratio # percent of tame kangaroos (e.g., 25–50) +``` +Example: +```bash +./rckangaroo ... -tame-bits 4 -tame-ratio 33 +``` + +> Goal: find combinations that **maximize MKeys/s** while minimizing **wall time** and keeping **memory** acceptable. + + +--- + +## Automated Benchmarks + +### Parameter sweep (grid) +```bash +# Edit the header of the script to set PUBKEY/RANGE/DP/TAMES/etc. +chmod +x bench_grid.sh summarize_bench.py + +# Run grid (stores everything under logs/) +./bench_grid.sh + +# Summarize to CSV +python3 summarize_bench.py logs > summary.csv +column -s, -t < summary.csv | less -S +``` +Jacobian OFF/ON comparison: +```bash +# Jacobian ON +./build.sh 86 1 release && MODE_TAG="j1" ./bench_grid.sh +python3 summarize_bench.py logs > summary_j1.csv + +# Jacobian OFF +./build.sh 86 0 release && MODE_TAG="j0" ./bench_grid.sh +python3 summarize_bench.py logs > summary_j0.csv +``` + +> **TIP**: Use `REPEATS>=5` to reduce jitter; the parser reports **medians** per combination. + + +--- + +## Reference Results (indicative) + +Quick 71‑bit tests on an RTX 3060: +- **TAMES v1.5**: 84 MB → **57 MB** (~32% smaller). +- **Wall time**: ~100 s → **~65 s** (Jacobian + v1.5 with same parameters). +- **RSS**: slight reduction (≈ −20–30 MB depending on run). + +> Numbers vary with DP, *tame-bits*, *tame-ratio*, GPU clocks, and driver version. + + +--- + +## Troubleshooting + +- **`Unknown option -ffunction-sections` from NVCC**: use `build.sh` (passes via `-Xcompiler`). +- **`No rule to make target 'RCGpuCore.o'`**: use this repo/Makefile or `./build.sh`. +- **`CUDA error / cap mismatch`**: compile via `./build.sh ...` (e.g., 75 for Turing, 86 for Ampere). + + +--- + +## License + +Inherits the original project’s license (see `LICENSE.TXT` if present). +Permitted for research and educational use. diff --git a/RESULTS.TXT b/RESULTS.TXT new file mode 100644 index 0000000..2ede806 --- /dev/null +++ b/RESULTS.TXT @@ -0,0 +1,4 @@ +PRIVATE KEY: 0000000000000000000000000000000000000000000000349B84B6431A6C4EF1 +PRIVATE KEY: 0000000000000000000000000000000000000000000000349B84B6431A6C4EF1 +PRIVATE KEY: 0000000000000000000000000000000000000000000000101D83275FB2BC7E0C +PRIVATE KEY: 00000000000000000000000000000000000000000000000BEBB3940CD0FC1491 diff --git a/bench_grid.sh b/bench_grid.sh new file mode 100755 index 0000000..8b03c17 --- /dev/null +++ b/bench_grid.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ========= Config por defecto (sobre-escribibles por variables de entorno) ========= +PUBKEY="${PUBKEY:-0290e6900a58d33393bc1097b5aed31f2e4e7cbd3e5466af958665bc0121248483}" +RANGE="${RANGE:-71}" +START="${START:-0}" +TAMES_FILE="${TAMES_FILE:-tames71_v15.dat}" # Cambiá si querés otro archivo +REPEATS="${REPEATS:-5}" +DP_LIST="${DP_LIST:-14 15 16}" +TAME_BITS_LIST="${TAME_BITS_LIST:-4 5}" +TAME_RATIO_LIST="${TAME_RATIO_LIST:-25 33 40}" +MODE_TAG="${MODE_TAG:-j1}" # usa j1/j0 para distinguir Jacobiano ON/OFF +LOGDIR="${LOGDIR:-logs}" + +mkdir -p "$LOGDIR" + +echo "== Bench grid ==" +echo "PUBKEY=$PUBKEY" +echo "RANGE=$RANGE START=$START" +echo "TAMES_FILE=$TAMES_FILE" +echo "REPEATS=$REPEATS" +echo "DP_LIST=$DP_LIST" +echo "TAME_BITS_LIST=$TAME_BITS_LIST" +echo "TAME_RATIO_LIST=$TAME_RATIO_LIST" +echo "MODE_TAG=$MODE_TAG" +echo "LOGDIR=$LOGDIR" +echo + +# Chequeos +if [[ ! -x "./rckangaroo" ]]; then + echo "ERROR: ./rckangaroo no existe o no es ejecutable. Compilá primero." >&2 + exit 1 +fi +if [[ ! -f "$TAMES_FILE" ]]; then + echo "ERROR: no se encontró $TAMES_FILE" >&2 + exit 1 +fi + +run_one() { + local dp="$1" + local tb="$2" + local tr="$3" + local r="$4" + local of="$LOGDIR/${MODE_TAG}_dp${dp}_tb${tb}_tr${tr}_run${r}.log" + + echo ">> dp=$dp tame-bits=$tb tame-ratio=$tr run=$r" + /usr/bin/time -f "%E real %Mk RSS %I in KB %O out KB" \ + ./rckangaroo -pubkey "$PUBKEY" -range "$RANGE" -dp "$dp" -start "$START" \ + -tames "$TAMES_FILE" -tame-bits "$tb" -tame-ratio "$tr" \ + |& tee "$of" +} + +for dp in $DP_LIST; do + for tb in $TAME_BITS_LIST; do + for tr in $TAME_RATIO_LIST; do + for r in $(seq 1 "$REPEATS"); do + run_one "$dp" "$tb" "$tr" "$r" + done + done + done +done + +echo +echo "== Listo. Logs guardados en $LOGDIR" +echo "Sugerencia: python3 summarize_bench.py $LOGDIR > summary_${MODE_TAG}.csv" diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..10ac8bf --- /dev/null +++ b/build.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +set -euo pipefail + +SM="${1:-86}" +USE_JACOBIAN="${2:-1}" +PROFILE="${3:-release}" +CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-12.0}" + +HOST_COPT_release="-O3 -DNDEBUG -ffunction-sections -fdata-sections" +HOST_COPT_debug="-O0 -g" +DEV_COPT_release="-O3" +DEV_COPT_debug="-O0 -g" + +if [[ "$PROFILE" == "release" ]]; then + HOST_COPT="$HOST_COPT_release" + DEV_COPT="$DEV_COPT_release" +else + HOST_COPT="$HOST_COPT_debug" + DEV_COPT="$DEV_COPT_debug" +fi + +CCFLAGS="-std=c++17 -I${CUDA_PATH}/include ${HOST_COPT} -DUSE_JACOBIAN=${USE_JACOBIAN}" +NVCCFLAGS="-std=c++17 -arch=sm_${SM} ${DEV_COPT} -Xptxas -O3 -Xptxas -dlcm=ca -Xfatbin=-compress-all -DUSE_JACOBIAN=${USE_JACOBIAN}" +NVCCXCOMP="-Xcompiler -ffunction-sections -Xcompiler -fdata-sections" +LDFLAGS="-L${CUDA_PATH}/lib64 -lcudart -pthread" + +echo "== CCFLAGS: ${CCFLAGS}" +echo "== NVCCFLAGS: ${NVCCFLAGS} ${NVCCXCOMP}" + +# Compile C++ +g++ ${CCFLAGS} -c RCKangaroo.cpp -o RCKangaroo.o +g++ ${CCFLAGS} -c GpuKang.cpp -o GpuKang.o +g++ ${CCFLAGS} -c Ec.cpp -o Ec.o +g++ ${CCFLAGS} -c utils.cpp -o utils.o + +# Compile CUDA (if present) +if [[ -f "RCGpuCore.cu" ]]; then + /usr/bin/nvcc ${NVCCFLAGS} ${NVCCXCOMP} -c RCGpuCore.cu -o RCGpuCore.o + g++ ${CCFLAGS} -o rckangaroo RCKangaroo.o GpuKang.o Ec.o utils.o RCGpuCore.o ${LDFLAGS} +else + echo "WARN: RCGpuCore.cu no existe; enlazando CPU-only" + g++ ${CCFLAGS} -o rckangaroo RCKangaroo.o GpuKang.o Ec.o utils.o ${LDFLAGS} +fi +echo "== Listo: ./rckangaroo" diff --git a/defs.h b/defs.h index c84e192..724a527 100644 --- a/defs.h +++ b/defs.h @@ -1,97 +1,115 @@ -// This file is a part of RCKangaroo software -// (c) 2024, RetiredCoder (RC) -// License: GPLv3, see "LICENSE.TXT" file -// https://github.com/RetiredC - - -#pragma once - -#pragma warning(disable : 4996) - -typedef unsigned long long u64; -typedef long long i64; -typedef unsigned int u32; -typedef int i32; -typedef unsigned short u16; -typedef short i16; -typedef unsigned char u8; -typedef char i8; - - - -#define MAX_GPU_CNT 32 - -//must be divisible by MD_LEN -#define STEP_CNT 1000 - -#define JMP_CNT 512 - -//use different options for cards older than RTX 40xx -#ifdef __CUDA_ARCH__ - #if __CUDA_ARCH__ < 890 - #define OLD_GPU - #endif - #ifdef OLD_GPU - #define BLOCK_SIZE 512 - //can be 8, 16, 24, 32, 40, 48, 56, 64 - #define PNT_GROUP_CNT 64 - #else - #define BLOCK_SIZE 256 - //can be 8, 16, 24, 32 - #define PNT_GROUP_CNT 24 - #endif -#else //CPU, fake values - #define BLOCK_SIZE 512 - #define PNT_GROUP_CNT 64 -#endif - -// kang type -#define TAME 0 // Tame kangs -#define WILD1 1 // Wild kangs1 -#define WILD2 2 // Wild kangs2 - -#define GPU_DP_SIZE 48 -#define MAX_DP_CNT (256 * 1024) - -#define JMP_MASK (JMP_CNT-1) - -#define DPTABLE_MAX_CNT 16 - -#define MAX_CNT_LIST (512 * 1024) - -#define DP_FLAG 0x8000 -#define INV_FLAG 0x4000 -#define JMP2_FLAG 0x2000 - -#define MD_LEN 10 - -//#define DEBUG_MODE - -//gpu kernel parameters -struct TKparams -{ - u64* Kangs; - u32 KangCnt; - u32 BlockCnt; - u32 BlockSize; - u32 GroupCnt; - u64* L2; - u64 DP; - u32* DPs_out; - u64* Jumps1; //x(32b), y(32b), d(32b) - u64* Jumps2; //x(32b), y(32b), d(32b) - u64* Jumps3; //x(32b), y(32b), d(32b) - u64* JumpsList; //list of all performed jumps, grouped by warp(32) every 8 groups (from PNT_GROUP_CNT). Each jump is 2 bytes: 10bit jump index + flags: INV_FLAG, DP_FLAG, JMP2_FLAG - u32* DPTable; - u32* L1S2; - u64* LastPnts; - u64* LoopTable; - u32* dbg_buf; - u32* LoopedKangs; - bool IsGenMode; //tames generation mode - - u32 KernelA_LDS_Size; - u32 KernelB_LDS_Size; - u32 KernelC_LDS_Size; -}; - +// This file is a part of RCKangaroo software +// (c) 2024, RetiredCoder (RC) +// License: GPLv3, see "LICENSE.TXT" file +// https://github.com/RetiredC + + +#pragma once + +#pragma warning(disable : 4996) + +typedef unsigned long long u64; +typedef long long i64; +typedef unsigned int u32; +typedef int i32; +typedef unsigned short u16; +typedef short i16; +// === Feature flags (performance/algorithm toggles) ============================ +#ifndef USE_JACOBIAN +// 0 = affine + inversión por lotes (actual predeterminado, muy rápido en GPU) +// 1 = coordenadas jacobianas + conversión por lotes a afín sólo para DP +#define USE_JACOBIAN 0 +#endif + +#ifndef SCALARMUL_W +// Ventana para w-NAF (CPU). 4 es un buen equilibrio entre memoria y velocidad. +#define SCALARMUL_W 4 +#endif + +#ifndef USE_MONTGOMERY_LADDER +// 1 para habilitar la multiplicación escalar por Montgomery Ladder (CPU) +#define USE_MONTGOMERY_LADDER 1 +#endif +// ============================================================================ + +typedef unsigned char u8; +typedef char i8; + + + +#define MAX_GPU_CNT 32 + +//must be divisible by MD_LEN +#define STEP_CNT 1000 + +#define JMP_CNT 512 + +//use different options for cards older than RTX 40xx +#ifdef __CUDA_ARCH__ + #if __CUDA_ARCH__ < 890 + #define OLD_GPU + #endif + #ifdef OLD_GPU + #define BLOCK_SIZE 512 + //can be 8, 16, 24, 32, 40, 48, 56, 64 + #define PNT_GROUP_CNT 64 + #else + #define BLOCK_SIZE 256 + //can be 8, 16, 24, 32 + #define PNT_GROUP_CNT 24 + #endif +#else //CPU, fake values + #define BLOCK_SIZE 512 + #define PNT_GROUP_CNT 64 +#endif + +// kang type +#define TAME 0 // Tame kangs +#define WILD1 1 // Wild kangs1 +#define WILD2 2 // Wild kangs2 + +#define GPU_DP_SIZE 48 +#define MAX_DP_CNT (256 * 1024) + +#define JMP_MASK (JMP_CNT-1) + +#define DPTABLE_MAX_CNT 16 + +#define MAX_CNT_LIST (512 * 1024) + +#define DP_FLAG 0x8000 +#define INV_FLAG 0x4000 +#define JMP2_FLAG 0x2000 + +#define MD_LEN 10 + +//#define DEBUG_MODE + +//gpu kernel parameters +struct TKparams +{ + u64* Kangs; + u32 KangCnt; + u32 BlockCnt; + u32 BlockSize; + u32 GroupCnt; + u64* L2; + u64 DP; + u32* DPs_out; + u64* Jumps1; //x(32b), y(32b), d(32b) + u64* Jumps2; //x(32b), y(32b), d(32b) + u64* Jumps3; //x(32b), y(32b), d(32b) + u64* JumpsList; //list of all performed jumps, grouped by warp(32) every 8 groups (from PNT_GROUP_CNT). Each jump is 2 bytes: 10bit jump index + flags: INV_FLAG, DP_FLAG, JMP2_FLAG + u32* DPTable; + u32* L1S2; + u64* LastPnts; + u64* LoopTable; + u32* dbg_buf; + u32* LoopedKangs; + bool IsGenMode; //tames generation mode + + u32 KernelA_LDS_Size; + u32 KernelB_LDS_Size; + u32 KernelC_LDS_Size; +}; + diff --git a/gitignore b/gitignore new file mode 100644 index 0000000..03555f4 --- /dev/null +++ b/gitignore @@ -0,0 +1,43 @@ +# RCKangaroo .gitignore + +# Build artifacts +*.o +*.obj +*.exe +*.out +*.dll +*.so +*.dylib + +# CUDA intermediate files +*.cubin +*.fatbin +*.ptx + +# Generated binaries +RCKangaroo +RCKangaroo.exe +rckangaroo +rckangaroo.exe + +# Logs and debug +*.log +*.tmp +*.bak + +# Data files (distinguished points, etc.) +*.dat +*.bin + +# Editor/OS files +.DS_Store +Thumbs.db +*.swp +*.swo + +# IDE/project configs +.vscode/ +.idea/ +*.user +*.sln +*.vcxproj* diff --git a/rckangaroo b/rckangaroo new file mode 100755 index 0000000..1a7f5dd Binary files /dev/null and b/rckangaroo differ diff --git a/summarize_bench.py b/summarize_bench.py new file mode 100755 index 0000000..677859f --- /dev/null +++ b/summarize_bench.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +import sys, re, statistics, os, glob + +# Extrae el máximo 'Speed: XXX MKeys/s' en cada log (robusto a warmups) +speed_re = re.compile(r"Speed:\s*([0-9]+(?:\.[0-9]+)?)\s*MKeys/s", re.IGNORECASE) + +# '/usr/bin/time -f "%E real %Mk RSS %I in KB %O out KB"' +time_re = re.compile(r"(?P(\d+:)?\d+\.\d+)\s+real") +rss_re = re.compile(r"\s(?P\d+)k\s+RSS", re.IGNORECASE) + +def parse_real_to_seconds(s): + # formatos "0:28.40" ó "12.34" + if ":" in s: + m, sec = s.split(":") + return int(m) * 60 + float(sec) + return float(s) + +def parse_log(path): + max_speed = None + real_s = None + rss_kb = None + + with open(path, "r", errors="ignore") as f: + for line in f: + m = speed_re.search(line) + if m: + v = float(m.group(1)) + if (max_speed is None) or (v > max_speed): + max_speed = v + + f.seek(0) + data = f.read() + + tm = time_re.search(data) + if tm: + real_s = parse_real_to_seconds(tm.group("real")) + rm = rss_re.search(data) + if rm: + rss_kb = int(rm.group("rss")) + + return max_speed, real_s, rss_kb + +def pick(val_list, func): + vals = [v for v in val_list if v is not None] + if not vals: return "" + return func(vals) + +def main(): + if len(sys.argv) < 2: + print("Uso: summarize_bench.py [pattern]", file=sys.stderr) + sys.exit(1) + logdir = sys.argv[1] + pattern = sys.argv[2] if len(sys.argv) >= 3 else "*.log" + + files = glob.glob(os.path.join(logdir, pattern)) + if not files: + print("No hay logs en", logdir, file=sys.stderr) + sys.exit(2) + + # nombres: _dp_tb_tr_runX.log + import re + name_re = re.compile(r".*?_dp(?P\d+)_tb(?P\d+)_tr(?P\d+)_run(?P\d+)\.log$") + + groups = {} + for p in sorted(files): + m = name_re.match(os.path.basename(p)) + if not m: + continue + dp = int(m.group("dp")); tb = int(m.group("tb")); tr = int(m.group("tr")) + metrics = parse_log(p) + groups.setdefault((dp,tb,tr), []).append(metrics) + + print("dp,tame_bits,tame_ratio,runs,median_speed_MKeys,max_speed_MKeys,median_real_s,median_rss_kb") + + for (dp,tb,tr), lst in sorted(groups.items()): + speeds = [x[0] for x in lst if x[0] is not None] + reals = [x[1] for x in lst if x[1] is not None] + rsslist = [x[2] for x in lst if x[2] is not None] + + med_speed = pick(speeds, statistics.median) + max_speed = pick(speeds, max) + med_real = pick(reals, statistics.median) + med_rss = pick(rsslist, statistics.median) + + def fmt(x): + if x == "": return "" + if isinstance(x, float): + return f"{x:.3f}" + return str(x) + + print(f"{dp},{tb},{tr},{len(lst)},{fmt(med_speed)},{fmt(max_speed)},{fmt(med_real)},{fmt(med_rss)}") + +if __name__ == '__main__': + main() diff --git a/summary_j1.csv b/summary_j1.csv new file mode 100644 index 0000000..d1d0104 --- /dev/null +++ b/summary_j1.csv @@ -0,0 +1,19 @@ +dp,tame_bits,tame_ratio,runs,median_speed_MKeys,max_speed_MKeys,median_real_s,median_rss_kb +14,4,25,5,755.000,784.000,66.040,745940 +14,4,33,5,740.000,744.000,100.080,821616 +14,4,40,5,740.000,753.000,85.440,787120 +14,5,25,5,753.000,755.000,93.070,806608 +14,5,33,5,759.000,761.000,60.130,728480 +14,5,40,5,760.000,762.000,91.140,805548 +15,4,25,5,763.000,765.000,88.580,687048 +15,4,33,5,755.000,759.000,56.670,645252 +15,4,40,5,760.000,764.000,66.400,657748 +15,5,25,5,749.000,765.000,54.610,641992 +15,5,33,5,736.000,740.000,121.550,725884 +15,5,40,5,736.000,738.000,52.380,638124 +16,4,25,5,735.000,736.000,98.060,633456 +16,4,33,5,731.000,733.000,82.430,623392 +16,4,40,5,732.000,733.000,60.760,609024 +16,5,25,5,736.000,751.000,80.850,622144 +16,5,33,5,762.000,763.000,81.740,624944 +16,5,40,5,769.000,778.000,109.390,644316 diff --git a/utils.cpp b/utils.cpp index 6665a75..b1a660e 100644 --- a/utils.cpp +++ b/utils.cpp @@ -1,312 +1,403 @@ -// This file is a part of RCKangaroo software -// (c) 2024, RetiredCoder (RC) -// License: GPLv3, see "LICENSE.TXT" file -// https://github.com/RetiredC - - -#include "utils.h" -#include - -#ifdef _WIN32 - -#else - -void _BitScanReverse64(u32* index, u64 msk) -{ - *index = 63 - __builtin_clzll(msk); -} - -void _BitScanForward64(u32* index, u64 msk) -{ - *index = __builtin_ffsll(msk) - 1; -} - -u64 _umul128(u64 m1, u64 m2, u64* hi) -{ - uint128_t ab = (uint128_t)m1 * m2; *hi = (u64)(ab >> 64); return (u64)ab; -} - -u64 __shiftright128 (u64 LowPart, u64 HighPart, u8 Shift) -{ - u64 ret; - __asm__ ("shrd {%[Shift],%[HighPart],%[LowPart]|%[LowPart], %[HighPart], %[Shift]}" - : [ret] "=r" (ret) - : [LowPart] "0" (LowPart), [HighPart] "r" (HighPart), [Shift] "Jc" (Shift) - : "cc"); - return ret; -} - -u64 __shiftleft128 (u64 LowPart, u64 HighPart, u8 Shift) -{ - u64 ret; - __asm__ ("shld {%[Shift],%[LowPart],%[HighPart]|%[HighPart], %[LowPart], %[Shift]}" - : [ret] "=r" (ret) - : [LowPart] "r" (LowPart), [HighPart] "0" (HighPart), [Shift] "Jc" (Shift) - : "cc"); - return ret; -} - -u64 GetTickCount64() -{ - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC_RAW, &ts); - return (u64)(ts.tv_nsec / 1000000) + ((u64)ts.tv_sec * 1000ull); -} -#endif - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define DB_REC_LEN 32 -#define DB_FIND_LEN 9 -#define DB_MIN_GROW_CNT 2 - -//we need advanced memory management to reduce memory fragmentation -//everything will be stable up to about 8TB RAM - -#define MEM_PAGE_SIZE (128 * 1024) -#define RECS_IN_PAGE (MEM_PAGE_SIZE / DB_REC_LEN) -#define MAX_PAGES_CNT (0xFFFFFFFF / RECS_IN_PAGE) - -MemPool::MemPool() -{ - pnt = 0; -} - -MemPool::~MemPool() -{ - Clear(); -} - -void MemPool::Clear() -{ - int cnt = (int)pages.size(); - for (int i = 0; i < cnt; i++) - free(pages[i]); - pages.clear(); - pnt = 0; -} - -void* MemPool::AllocRec(u32* cmp_ptr) -{ - void* mem; - if (pages.empty() || (pnt + DB_REC_LEN > MEM_PAGE_SIZE)) - { - if (pages.size() >= MAX_PAGES_CNT) - return NULL; //overflow - pages.push_back(malloc(MEM_PAGE_SIZE)); - pnt = 0; - } - u32 page_ind = (u32)pages.size() - 1; - mem = (u8*)pages[page_ind] + pnt; - *cmp_ptr = (page_ind * RECS_IN_PAGE) | (pnt / DB_REC_LEN); - pnt += DB_REC_LEN; - return mem; -} - -void* MemPool::GetRecPtr(u32 cmp_ptr) -{ - u32 page_ind = cmp_ptr / RECS_IN_PAGE; - u32 rec_ind = cmp_ptr % RECS_IN_PAGE; - return (u8*)pages[page_ind] + DB_REC_LEN * rec_ind; -} - -TFastBase::TFastBase() -{ - memset(lists, 0, sizeof(lists)); - memset(Header, 0, sizeof(Header)); -} - -TFastBase::~TFastBase() -{ - Clear(); -} - -void TFastBase::Clear() -{ - for (int i = 0; i < 256; i++) - { - for (int j = 0; j < 256; j++) - for (int k = 0; k < 256; k++) - { - if (lists[i][j][k].data) - free(lists[i][j][k].data); - lists[i][j][k].data = NULL; - lists[i][j][k].capacity = 0; - lists[i][j][k].cnt = 0; - } - mps[i].Clear(); - } -} - -u64 TFastBase::GetBlockCnt() -{ - u64 blockCount = 0; - for (int i = 0; i < 256; i++) - for (int j = 0; j < 256; j++) - for (int k = 0; k < 256; k++) - blockCount += lists[i][j][k].cnt; - return blockCount; -} - -// http://en.cppreference.com/w/cpp/algorithm/lower_bound -int TFastBase::lower_bound(TListRec* list, int mps_ind, u8* data) -{ - int count = list->cnt; - int it, first, step; - first = 0; - while (count > 0) - { - it = first; - step = count / 2; - it += step; - void* ptr = mps[mps_ind].GetRecPtr(list->data[it]); - if (memcmp(ptr, data, DB_FIND_LEN) < 0) - { - first = ++it; - count -= step + 1; - } - else - count = step; - } - return first; -} - -u8* TFastBase::AddDataBlock(u8* data, int pos) -{ - TListRec* list = &lists[data[0]][data[1]][data[2]]; - if (list->cnt >= list->capacity) - { - u32 grow = list->capacity / 2; - if (grow < DB_MIN_GROW_CNT) - grow = DB_MIN_GROW_CNT; - u32 newcap = list->capacity + grow; - if (newcap > 0xFFFF) - newcap = 0xFFFF; - if (newcap <= list->capacity) - return NULL; //failed - list->data = (u32*)realloc(list->data, newcap * sizeof(u32)); - list->capacity = newcap; - } - int first = (pos < 0) ? lower_bound(list, data[0], data + 3) : pos; - memmove(list->data + first + 1, list->data + first, (list->cnt - first) * sizeof(u32)); - u32 cmp_ptr; - void* ptr = mps[data[0]].AllocRec(&cmp_ptr); - list->data[first] = cmp_ptr; - memcpy(ptr, data + 3, DB_REC_LEN); - list->cnt++; - return (u8*)ptr; -} - -u8* TFastBase::FindDataBlock(u8* data) -{ - bool res = false; - TListRec* list = &lists[data[0]][data[1]][data[2]]; - int first = lower_bound(list, data[0], data + 3); - if (first == list->cnt) - return NULL; - void* ptr = mps[data[0]].GetRecPtr(list->data[first]); - if (memcmp(ptr, data + 3, DB_FIND_LEN)) - return NULL; - return (u8*)ptr; -} - -u8* TFastBase::FindOrAddDataBlock(u8* data) -{ - void* ptr; - TListRec* list = &lists[data[0]][data[1]][data[2]]; - int first = lower_bound(list, data[0], data + 3); - if (first == list->cnt) - goto label_not_found; - ptr = mps[data[0]].GetRecPtr(list->data[first]); - if (memcmp(ptr, data + 3, DB_FIND_LEN)) - goto label_not_found; - return (u8*)ptr; -label_not_found: - AddDataBlock(data, first); - return NULL; -} - -//slow but I hope you are not going to create huge DB with this proof-of-concept software -bool TFastBase::LoadFromFile(char* fn) -{ - Clear(); - FILE* fp = fopen(fn, "rb"); - if (!fp) - return false; - if (fread(Header, 1, sizeof(Header), fp) != sizeof(Header)) - { - fclose(fp); - return false; - } - for (int i = 0; i < 256; i++) - for (int j = 0; j < 256; j++) - for (int k = 0; k < 256; k++) - { - TListRec* list = &lists[i][j][k]; - fread(&list->cnt, 1, 2, fp); - if (list->cnt) - { - u32 grow = list->cnt / 2; - if (grow < DB_MIN_GROW_CNT) - grow = DB_MIN_GROW_CNT; - u32 newcap = list->cnt + grow; - if (newcap > 0xFFFF) - newcap = 0xFFFF; - list->data = (u32*)realloc(list->data, newcap * sizeof(u32)); - list->capacity = newcap; - - for (int m = 0; m < list->cnt; m++) - { - u32 cmp_ptr; - void* ptr = mps[i].AllocRec(&cmp_ptr); - list->data[m] = cmp_ptr; - if (fread(ptr, 1, DB_REC_LEN, fp) != DB_REC_LEN) - { - fclose(fp); - return false; - } - } - } - } - fclose(fp); - return true; -} - -bool TFastBase::SaveToFile(char* fn) -{ - FILE* fp = fopen(fn, "wb"); - if (!fp) - return false; - if (fwrite(Header, 1, sizeof(Header), fp) != sizeof(Header)) - { - fclose(fp); - return false; - } - for (int i = 0; i < 256; i++) - for (int j = 0; j < 256; j++) - for (int k = 0; k < 256; k++) - { - TListRec* list = &lists[i][j][k]; - fwrite(&list->cnt, 1, 2, fp); - for (int m = 0; m < list->cnt; m++) - { - void* ptr = mps[i].GetRecPtr(list->data[m]); - if (fwrite(ptr, 1, DB_REC_LEN, fp) != DB_REC_LEN) - { - fclose(fp); - return false; - } - } - } - fclose(fp); - return true; -} - -bool IsFileExist(char* fn) -{ - FILE* fp = fopen(fn, "rb"); - if (!fp) - return false; - fclose(fp); - return true; +// This file is a part of RCKangaroo software +// (c) 2024, RetiredCoder (RC) +// License: GPLv3, see "LICENSE.TXT" file +// https://github.com/RetiredC + + +#include "utils.h" +#include +static const char kTamesV15Tag[8] = {'T','M','B','M','1','5','\0','\0'}; +static const char kTamesV16Tag[8] = {'T','M','B','M','1','6','\0','\0'}; + + +#ifndef DB_FIND_LEN +#define DB_FIND_LEN 5 +#endif +#ifndef DB_REC_LEN +#define DB_REC_LEN (DB_FIND_LEN + 22 + 1) +#endif +#ifdef _WIN32 + +#else + +void _BitScanReverse64(u32* index, u64 msk) +{ + *index = 63 - __builtin_clzll(msk); +} + +void _BitScanForward64(u32* index, u64 msk) +{ + *index = __builtin_ffsll(msk) - 1; +} + +u64 _umul128(u64 m1, u64 m2, u64* hi) +{ + uint128_t ab = (uint128_t)m1 * m2; *hi = (u64)(ab >> 64); return (u64)ab; +} + +u64 __shiftright128 (u64 LowPart, u64 HighPart, u8 Shift) +{ + u64 ret; + __asm__ ("shrd {%[Shift],%[HighPart],%[LowPart]|%[LowPart], %[HighPart], %[Shift]}" + : [ret] "=r" (ret) + : [LowPart] "0" (LowPart), [HighPart] "r" (HighPart), [Shift] "Jc" (Shift) + : "cc"); + return ret; +} + +u64 __shiftleft128 (u64 LowPart, u64 HighPart, u8 Shift) +{ + u64 ret; + __asm__ ("shld {%[Shift],%[LowPart],%[HighPart]|%[HighPart], %[LowPart], %[Shift]}" + : [ret] "=r" (ret) + : [LowPart] "r" (LowPart), [HighPart] "0" (HighPart), [Shift] "Jc" (Shift) + : "cc"); + return ret; +} + +u64 GetTickCount64() +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (u64)(ts.tv_nsec / 1000000) + ((u64)ts.tv_sec * 1000ull); +} +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define DB_REC_LEN 32 +#define DB_FIND_LEN 9 +#define DB_MIN_GROW_CNT 2 + +//we need advanced memory management to reduce memory fragmentation +//everything will be stable up to about 8TB RAM + +#define MEM_PAGE_SIZE (128 * 1024) +#define RECS_IN_PAGE (MEM_PAGE_SIZE / DB_REC_LEN) +#define MAX_PAGES_CNT (0xFFFFFFFF / RECS_IN_PAGE) + +MemPool::MemPool() +{ + pnt = 0; +} + +MemPool::~MemPool() +{ + Clear(); +} + +void MemPool::Clear() +{ + int cnt = (int)pages.size(); + for (int i = 0; i < cnt; i++) + free(pages[i]); + pages.clear(); + pnt = 0; +} + +void* MemPool::AllocRec(u32* cmp_ptr) +{ + void* mem; + if (pages.empty() || (pnt + DB_REC_LEN > MEM_PAGE_SIZE)) + { + if (pages.size() >= MAX_PAGES_CNT) + return NULL; //overflow + pages.push_back(malloc(MEM_PAGE_SIZE)); + pnt = 0; + } + u32 page_ind = (u32)pages.size() - 1; + mem = (u8*)pages[page_ind] + pnt; + *cmp_ptr = (page_ind * RECS_IN_PAGE) | (pnt / DB_REC_LEN); + pnt += DB_REC_LEN; + return mem; +} + +void* MemPool::GetRecPtr(u32 cmp_ptr) +{ + u32 page_ind = cmp_ptr / RECS_IN_PAGE; + u32 rec_ind = cmp_ptr % RECS_IN_PAGE; + return (u8*)pages[page_ind] + DB_REC_LEN * rec_ind; +} + +TFastBase::TFastBase() +{ + memset(lists, 0, sizeof(lists)); + memset(Header, 0, sizeof(Header)); +} + +TFastBase::~TFastBase() +{ + Clear(); +} + +void TFastBase::Clear() +{ + for (int i = 0; i < 256; i++) + { + for (int j = 0; j < 256; j++) + for (int k = 0; k < 256; k++) + { + if (lists[i][j][k].data) + free(lists[i][j][k].data); + lists[i][j][k].data = NULL; + lists[i][j][k].capacity = 0; + lists[i][j][k].cnt = 0; + } + mps[i].Clear(); + } +} + +u64 TFastBase::GetBlockCnt() +{ + u64 blockCount = 0; + for (int i = 0; i < 256; i++) + for (int j = 0; j < 256; j++) + for (int k = 0; k < 256; k++) + blockCount += lists[i][j][k].cnt; + return blockCount; +} + +// http://en.cppreference.com/w/cpp/algorithm/lower_bound +int TFastBase::lower_bound(TListRec* list, int mps_ind, u8* data) +{ + int count = list->cnt; + int it, first, step; + first = 0; + while (count > 0) + { + it = first; + step = count / 2; + it += step; + void* ptr = mps[mps_ind].GetRecPtr(list->data[it]); + if (memcmp(ptr, data, DB_FIND_LEN) < 0) + { + first = ++it; + count -= step + 1; + } + else + count = step; + } + return first; +} + +u8* TFastBase::AddDataBlock(u8* data, int pos) +{ + TListRec* list = &lists[data[0]][data[1]][data[2]]; + if (list->cnt >= list->capacity) + { + u32 grow = list->capacity / 2; + if (grow < DB_MIN_GROW_CNT) + grow = DB_MIN_GROW_CNT; + u32 newcap = list->capacity + grow; + if (newcap > 0xFFFF) + newcap = 0xFFFF; + if (newcap <= list->capacity) + return NULL; //failed + list->data = (u32*)realloc(list->data, newcap * sizeof(u32)); + list->capacity = newcap; + } + int first = (pos < 0) ? lower_bound(list, data[0], data + 3) : pos; + memmove(list->data + first + 1, list->data + first, (list->cnt - first) * sizeof(u32)); + u32 cmp_ptr; + void* ptr = mps[data[0]].AllocRec(&cmp_ptr); + list->data[first] = cmp_ptr; + memcpy(ptr, data + 3, DB_REC_LEN); + list->cnt++; + return (u8*)ptr; +} + +u8* TFastBase::FindDataBlock(u8* data) +{ + bool res = false; + TListRec* list = &lists[data[0]][data[1]][data[2]]; + int first = lower_bound(list, data[0], data + 3); + if (first == list->cnt) + return NULL; + void* ptr = mps[data[0]].GetRecPtr(list->data[first]); + if (memcmp(ptr, data + 3, DB_FIND_LEN)) + return NULL; + return (u8*)ptr; +} + +u8* TFastBase::FindOrAddDataBlock(u8* data) +{ + void* ptr; + TListRec* list = &lists[data[0]][data[1]][data[2]]; + int first = lower_bound(list, data[0], data + 3); + if (first == list->cnt) + goto label_not_found; + ptr = mps[data[0]].GetRecPtr(list->data[first]); + if (memcmp(ptr, data + 3, DB_FIND_LEN)) + goto label_not_found; + return (u8*)ptr; +label_not_found: + AddDataBlock(data, first); + return NULL; +} + +//slow but I hope you are not going to create huge DB with this proof-of-concept software +bool TFastBase::LoadFromFile(char* fn) +{ +Clear(); + FILE* fp = fopen(fn, "rb"); + if (!fp) return false; + + if (fread(Header, 1, sizeof(Header), fp) != sizeof(Header)) { + fclose(fp); return false; + } + + long pos_after_header = ftell(fp); + char tag[8]; + size_t got = fread(tag, 1, sizeof(tag), fp); + bool use_v15 = (got == sizeof(tag) && memcmp(tag, kTamesV15Tag, 8) == 0); + bool use_v16 = (got == sizeof(tag) && memcmp(tag, kTamesV16Tag, 8) == 0); + + if (!use_v15 && !use_v16) { + // --- Formato legado --- + fseek(fp, pos_after_header, SEEK_SET); + + for (int i = 0; i < 256; i++) + for (int j = 0; j < 256; j++) + for (int k = 0; k < 256; k++) + { + TListRec* list = &lists[i][j][k]; + + if (fread(&list->cnt, 2, 1, fp) != 1) { fclose(fp); return false; } + + if (list->cnt) { + // asegurar capacidad para list->data (u32*), no vector + if (list->capacity < list->cnt) { + unsigned short newcap = list->cnt; + if (newcap < 16) newcap = 16; + if (newcap > 0xFFFF) newcap = 0xFFFF; + list->data = (u32*)realloc(list->data, (size_t)newcap * sizeof(u32)); + if (!list->data) { fclose(fp); return false; } + list->capacity = newcap; + } + + for (int m = 0; m < list->cnt; m++) { + u32 cmp_ptr; + void* ptr = mps[i].AllocRec(&cmp_ptr); + list->data[m] = cmp_ptr; + + if (fread(ptr, 1, DB_REC_LEN, fp) != DB_REC_LEN) { + fclose(fp); return false; + } + } + } + } + + fclose(fp); + return true; + } + + // --- V1.5: bitmap + bulk por (i,j) --- + // selecc. tamaño por versión + size_t rec_len = use_v16 ? (size_t)DB_REC_LEN : (size_t)32; + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j++) { + + unsigned char bitmap[32]; + if (fread(bitmap, 1, 32, fp) != 32) { fclose(fp); return false; } + + for (int k = 0; k < 256; k++) { + TListRec* list = &lists[i][j][k]; + + if (bitmap[k >> 3] & (1u << (k & 7))) { + unsigned short cnt16; + if (fread(&cnt16, 2, 1, fp) != 1) { fclose(fp); return false; } + + // asegurar capacidad (u32*) + if (list->capacity < cnt16) { + unsigned short newcap = cnt16; + if (newcap < 16) newcap = 16; + if (newcap > 0xFFFF) newcap = 0xFFFF; + list->data = (u32*)realloc(list->data, (size_t)newcap * sizeof(u32)); + if (!list->data) { fclose(fp); return false; } + list->capacity = newcap; + } + list->cnt = cnt16; + + size_t bytes = (size_t)cnt16 * rec_len; + if (bytes) { + std::vector buf; buf.resize(bytes); + if (fread(buf.data(), 1, bytes, fp) != bytes) { fclose(fp); return false; } + + for (int m = 0; m < cnt16; m++) { + u32 cmp_ptr; + void* dst = mps[i].AllocRec(&cmp_ptr); + list->data[m] = cmp_ptr; + memcpy(dst, &buf[(size_t)m * rec_len], rec_len); + } + } + } else { + list->cnt = 0; + // conservamos capacidad existente, no tocamos list->data + } + } + } + } + + fclose(fp); + return true; +} + + +bool TFastBase::SaveToFile(char* fn) +{ +FILE* fp = fopen(fn, "wb"); + if (!fp) return false; + + if (fwrite(Header, 1, sizeof(Header), fp) != sizeof(Header)) { + fclose(fp); return false; + } + + // --- TAG V1.5 --- + if (fwrite(kTamesV16Tag, 1, sizeof(kTamesV16Tag), fp) != sizeof(kTamesV16Tag)) { + fclose(fp); return false; + } + + size_t rec_len = (size_t)DB_REC_LEN; + // (i,j): bitmap de 256 bits + por cada k set: cnt (u16) y bloque contiguo de registros + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j++) { + + unsigned char bitmap[32]; memset(bitmap, 0, sizeof(bitmap)); + for (int k = 0; k < 256; k++) { + TListRec* list = &lists[i][j][k]; + if (list->cnt) bitmap[k >> 3] |= (1u << (k & 7)); + } + if (fwrite(bitmap, 1, 32, fp) != 32) { fclose(fp); return false; } + + for (int k = 0; k < 256; k++) if (bitmap[k >> 3] & (1u << (k & 7))) { + TListRec* list = &lists[i][j][k]; + unsigned short cnt16 = (unsigned short)list->cnt; + if (fwrite(&cnt16, 2, 1, fp) != 1) { fclose(fp); return false; } + + size_t bytes = (size_t)cnt16 * rec_len; + if (bytes) { + // Junta en buffer para un solo fwrite + std::vector buf; buf.resize(bytes); + for (int m = 0; m < cnt16; m++) { + void* ptr = mps[i].GetRecPtr(list->data[m]); + memcpy(&buf[(size_t)m * DB_REC_LEN], ptr, DB_REC_LEN); + } + if (fwrite(buf.data(), 1, bytes, fp) != bytes) { fclose(fp); return false; } + } + } + } + } + + fclose(fp); + return true; +} + + +bool IsFileExist(char* fn) +{ + FILE* fp = fopen(fn, "rb"); + if (!fp) + return false; + fclose(fp); + return true; } \ No newline at end of file