diff --git a/CHANGELOG_v16.md b/CHANGELOG_v16.md
new file mode 100644
index 0000000..1c724df
--- /dev/null
+++ b/CHANGELOG_v16.md
@@ -0,0 +1,58 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.6.0] - 2025-08-22
+### Added
+- **Warp‑aggregated atomics** for distinguished point (DP) emission in GPU kernels.
+- **Spanish README** updates mirroring English documentation.
+- **Backward compatible reader** for legacy `.dat` v1.5 files.
+
+### Changed
+- **New compact `.dat` format (v1.6 / `TMBM16`)**: DP record size reduced from 32B → **28B** (X tail 5B, distance 22B, type 1B).
+- **Improved memory coalescing** when writing DPs, enabling more efficient PCIe transfers.
+- **Documentation**: Updated `README.md` and `README_es.md` with a “What’s New in v1.6” section, benchmarks, and build tips.
+
+### Performance
+- Throughput improvements of **+10–30%** (GPU/`-dp` dependent). Example: RTX 3060 `-dp 16` ~**+16%** over v1.5.
+
+### Compatibility
+- v1.6 binaries **read** both `.dat` **v1.5** and **v1.6** formats.
+- v1.6 **writes** `.dat` in the new v1.6 format by default.
+
+### Migration
+- No action required for existing `.dat` v1.5 users; they continue to load.
+- New runs will generate `.dat` v1.6 files (smaller on disk).
+
+---
+
+## [1.5.0] - 2024-XX-XX
+- Initial public release by RetiredC.
+- GPU implementation of Pollard Kangaroo with DP infrastructure.
+
+---
+
+# Registro de cambios (ES)
+
+## [1.6.0] - 2025-08-22
+### Añadido
+- **Atómicas warp‑aggregadas** para la emisión de DPs en los kernels de GPU.
+- **README en español** actualizado en paralelo al inglés.
+- **Lectura retrocompatible** de archivos `.dat` v1.5.
+
+### Cambiado
+- **Nuevo formato `.dat` (v1.6 / `TMBM16`)**: registro DP de 32B → **28B** (cola de X 5B, distancia 22B, tipo 1B).
+- **Mejor coalescencia de memoria** al escribir DPs, con transferencias PCIe más eficientes.
+- **Documentación**: `README.md` y `README_es.md` incluyen “Novedades v1.6”, benchmarks y flags de compilación.
+
+### Rendimiento
+- Mejora de **+10–30%** (según GPU y `-dp`). Ejemplo: RTX 3060 `-dp 16` ~**+16%** vs v1.5.
+
+### Compatibilidad
+- Los binarios v1.6 **leen** archivos `.dat` **v1.5** y **v1.6**.
+- v1.6 **escribe** por defecto el nuevo formato `.dat` v1.6.
+
+### Migración
+- No se requiere acción para usuarios con `.dat` v1.5; siguen cargando.
+- Nuevas ejecuciones generarán `.dat` v1.6 (más pequeño en disco).
diff --git a/Ec.cpp b/Ec.cpp
index c598f39..84b4742 100644
--- a/Ec.cpp
+++ b/Ec.cpp
@@ -1,741 +1,829 @@
-// This file is a part of RCKangaroo software
-// (c) 2024, RetiredCoder (RC)
-// License: GPLv3, see "LICENSE.TXT" file
-// https://github.com/RetiredC
-
-
-#include "defs.h"
-#include "Ec.h"
-#include <random>
-#include "utils.h"
-
-// https://en.bitcoin.it/wiki/Secp256k1
-EcInt g_P; //FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F
-EcPoint g_G; //Generator point
-
-#define P_REV	0x00000001000003D1
-
-#ifdef DEBUG_MODE
-u8* GTable = NULL; //16x16-bit table
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-bool parse_u8(const char* s, u8* res)
-{
-	char cl = toupper(s[1]);
-	char ch = toupper(s[0]);
-	if (((cl < '0') || (cl > '9')) && ((cl < 'A') || (cl > 'F')))
-		return false;
-	if (((ch < '0') || (ch > '9')) && ((ch < 'A') || (ch > 'F')))
-		return false;
-	u8 l = ((cl >= '0') && (cl <= '9')) ? (cl - '0') : (cl - 'A' + 10);
-	u8 h = ((ch >= '0') && (ch <= '9')) ? (ch - '0') : (ch - 'A' + 10);
-	*res = l + (h << 4);
-	return true;
-}
-
-bool EcPoint::IsEqual(EcPoint& pnt)
-{
-	return this->x.IsEqual(pnt.x) && this->y.IsEqual(pnt.y);
-}
-
-void EcPoint::LoadFromBuffer64(u8* buffer)
-{
-	memcpy(x.data, buffer, 32);
-	x.data[4] = 0;
-	memcpy(y.data, buffer + 32, 32);
-	y.data[4] = 0;
-}
-
-void EcPoint::SaveToBuffer64(u8* buffer)
-{
-	memcpy(buffer, x.data, 32);
-	memcpy(buffer + 32, y.data, 32);
-}
-
-bool EcPoint::SetHexStr(const char* str)
-{
-	EcPoint res;
-	int len = (int)strlen(str);
-	if (len < 66)
-		return false;
-	u8 type, b;
-	if (!parse_u8(str, &type))
-		return false;
-	if ((type < 2) || (type > 4))
-		return false;
-	if (((type == 2) || (type == 3)) && (len != 66))
-		return false;
-	if ((type == 4) && (len != 130))
-		return false;
-
-	if (len == 66) //compressed
-	{
-		str += 2;
-		for (int i = 0; i < 32; i++)
-		{
-			if (!parse_u8(str + 2 * i, &b))
-				return false;
-			((u8*)res.x.data)[31 - i] = b;
-		}
-		res.y = Ec::CalcY(res.x, type == 2);
-		if (!Ec::IsValidPoint(res))
-			return false;		
-		*this = res;
-		return true;
-	}
-	//uncompressed
-	str += 2;
-	for (int i = 0; i < 32; i++)
-	{
-		if (!parse_u8(str + 2 * i, &b))
-			return false;
-		((u8*)res.x.data)[31 - i] = b;
-
-		if (!parse_u8(str + 2 * i + 64, &b))
-			return false;
-		((u8*)res.y.data)[31 - i] = b;
-	}
-	if (!Ec::IsValidPoint(res))
-		return false;
-	*this = res;
-	return true;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// https://en.bitcoin.it/wiki/Secp256k1
-void InitEc()
-{
-	g_P.SetHexStr("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"); //Fp
-	g_G.x.SetHexStr("79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798"); //G.x
-	g_G.y.SetHexStr("483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8"); //G.y
-#ifdef DEBUG_MODE
-	GTable = (u8*)malloc(16 * 256 * 256 * 64);
-	EcPoint pnt = g_G;
-	for (int i = 0; i < 16; i++)
-	{
-		pnt.SaveToBuffer64(GTable + (i * 256 * 256) * 64);
-		EcPoint tmp = pnt;
-		pnt = Ec::DoublePoint(pnt);
-		for (int j = 1; j < 256 * 256 - 1; j++)
-		{
-			pnt.SaveToBuffer64(GTable + (i * 256 * 256 + j) * 64);
-			pnt = Ec::AddPoints(pnt, tmp);
-		}
-	}
-#endif
-};
-
-void DeInitEc()
-{
-#ifdef DEBUG_MODE
-	if (GTable)
-		free(GTable);
-#endif
-}
-
-// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_addition
-EcPoint Ec::AddPoints(EcPoint& pnt1, EcPoint& pnt2)
-{
-	EcPoint res;
-	EcInt dx, dy, lambda, lambda2;
-
-	dx = pnt2.x;
-	dx.SubModP(pnt1.x);
-	dx.InvModP();
-
-	dy = pnt2.y;
-	dy.SubModP(pnt1.y);
-
-	lambda = dy;
-	lambda.MulModP(dx);
-	lambda2 = lambda;
-	lambda2.MulModP(lambda);
-
-	res.x = lambda2;
-	res.x.SubModP(pnt1.x);
-	res.x.SubModP(pnt2.x);
-
-	res.y = pnt2.x;
-	res.y.SubModP(res.x);
-	res.y.MulModP(lambda);
-	res.y.SubModP(pnt2.y);
-	return res;
-}
-
-// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_doubling
-EcPoint Ec::DoublePoint(EcPoint& pnt)
-{
-	EcPoint res;
-	EcInt t1, t2, lambda, lambda2;
-
-	t1 = pnt.y;
-	t1.AddModP(pnt.y);
-	t1.InvModP();
-
-	t2 = pnt.x;
-	t2.MulModP(pnt.x);
-	lambda = t2;
-	lambda.AddModP(t2);
-	lambda.AddModP(t2);
-	lambda.MulModP(t1);
-	lambda2 = lambda;
-	lambda2.MulModP(lambda);
-
-	res.x = lambda2;
-	res.x.SubModP(pnt.x);
-	res.x.SubModP(pnt.x);
-
-	res.y = pnt.x;
-	res.y.SubModP(res.x);
-	res.y.MulModP(lambda);
-	res.y.SubModP(pnt.y);
-	return res;
-}
-
-//k up to 256 bits
-EcPoint Ec::MultiplyG(EcInt& k)
-{
-	EcPoint res;
-	EcPoint t = g_G;
-	bool first = true;
-	int n = 3;
-	while ((n >= 0) && !k.data[n])
-		n--;
-	if (n < 0)
-		return res; //error
-	int index;                     
-	_BitScanReverse64((DWORD*)&index, k.data[n]);
-	for (int i = 0; i <= 64 * n + index; i++)
-	{
-		u8 v = (k.data[i / 64] >> (i % 64)) & 1;	
-		if (v)
-		{
-			if (first)
-			{
-				first = false;
-				res = t;
-			}
-			else
-				res = Ec::AddPoints(res, t);
-		}
-		t = Ec::DoublePoint(t);
-	}
-	return res;
-}
-
-#ifdef DEBUG_MODE
-//uses gTable (16x16-bit) to speedup calculation
-EcPoint Ec::MultiplyG_Fast(EcInt& k)
-{
-	int i;
-	u16 b;
-	EcPoint pnt, res;
-	for (i = 0; i < 16; i++)
-	{
-		b = k.GetU16(i);
-		if (b)
-			break;
-	}
-	if (i >= 16)
-		return pnt;
-	if (i < 16)
-	{
-		res.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64);
-		i++;
-	}
-	while (i < 16)
-	{
-		b = k.GetU16(i);
-		if (b)
-		{
-			pnt.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64);
-			res = AddPoints(res, pnt);
-		}
-		i++;
-	}
-	return res;
-}
-#endif
-
-EcInt Ec::CalcY(EcInt& x, bool is_even)
-{
-	EcInt res;
-	EcInt tmp;
-	tmp.Set(7);
-	res = x;
-	res.MulModP(x);
-	res.MulModP(x);
-	res.AddModP(tmp);
-	res.SqrtModP();
-	if ((res.data[0] & 1) == is_even)
-		res.NegModP();
-	return res;
-}
-
-bool Ec::IsValidPoint(EcPoint& pnt)
-{
-	EcInt x, y, seven;
-	seven.Set(7);
-	x = pnt.x;
-	x.MulModP(pnt.x);
-	x.MulModP(pnt.x);
-	x.AddModP(seven);
-	y = pnt.y;
-	y.MulModP(pnt.y);
-	return x.IsEqual(y);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-void Mul256_by_64(u64* input, u64 multiplier, u64* result)
-{
-	u64 h1, h2;
-	result[0] = _umul128(input[0], multiplier, &h1);
-	u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1);
-	carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2);
-	carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3);
-	_addcarry_u64(carry, 0, h2, result + 4);
-}
-
-void Mul320_by_64(u64* input, u64 multiplier, u64* result)
-{
-	u64 h1, h2;
-	result[0] = _umul128(input[0], multiplier, &h1);
-	u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1);
-	carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2);
-	carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3);
-	_addcarry_u64(carry, _umul128(input[4], multiplier, &h1), h2, result + 4);
-}
-
-void Add320_to_256(u64* in_out, u64* val)
-{
-	u8 c = _addcarry_u64(0, in_out[0], val[0], in_out);
-	c = _addcarry_u64(c, in_out[1], val[1], in_out + 1);
-	c = _addcarry_u64(c, in_out[2], val[2], in_out + 2);
-	c = _addcarry_u64(c, in_out[3], val[3], in_out + 3);
-	_addcarry_u64(c, 0, val[4], in_out + 4);
-}
-
-EcInt::EcInt()
-{
-	SetZero();
-}
-
-void EcInt::Assign(EcInt& val)
-{
-	memcpy(data, val.data, sizeof(data));
-}
-
-void EcInt::Set(u64 val)
-{
-	SetZero();
-	data[0] = val;
-}
-
-void EcInt::SetZero()
-{
-	memset(data, 0, sizeof(data));
-}
-
-bool EcInt::SetHexStr(const char* str)
-{
-	SetZero();
-	int len = (int)strlen(str);
-	if (len > 64)
-		return false;
-	char s[64];
-	memset(s, '0', 64);
-	memcpy(s + 64 - len, str, len);
-	for (int i = 0; i < 32; i++)
-	{
-		int n = 62 - 2 * i;
-		u8 b;
-		if (!parse_u8(s + n, &b))
-			return false;
-		((u8*)data)[i] = b;
-	}
-	return true;
-}
-
-void EcInt::GetHexStr(char* str)
-{
-	for (int i = 0; i < 32; i++)
-		sprintf(str + 2 * i, "%02X", ((u8*)data)[31 - i]);
-	str[64] = 0;
-}
-
-u16 EcInt::GetU16(int index)
-{
-	return (u16)(data[index / 4] >> (16 * (index % 4)));
-}
-
-//returns carry
-bool EcInt::Add(EcInt& val)
-{
-	u8 c = _addcarry_u64(0, data[0], val.data[0], data + 0);
-	c = _addcarry_u64(c, data[1], val.data[1], data + 1);
-	c = _addcarry_u64(c, data[2], val.data[2], data + 2);
-	c = _addcarry_u64(c, data[3], val.data[3], data + 3);
-	return _addcarry_u64(c, data[4], val.data[4], data + 4) != 0;
-}
-
-//returns carry
-bool EcInt::Sub(EcInt& val)
-{
-	u8 c = _subborrow_u64(0, data[0], val.data[0], data + 0);
-	c = _subborrow_u64(c, data[1], val.data[1], data + 1);
-	c = _subborrow_u64(c, data[2], val.data[2], data + 2);
-	c = _subborrow_u64(c, data[3], val.data[3], data + 3);
-	return _subborrow_u64(c, data[4], val.data[4], data + 4) != 0;
-}
-
-void EcInt::Neg()
-{
-	u8 c = _subborrow_u64(0, 0, data[0], data + 0);
-	c = _subborrow_u64(c, 0, data[1], data + 1);
-	c = _subborrow_u64(c, 0, data[2], data + 2);
-	c = _subborrow_u64(c, 0, data[3], data + 3);
-	_subborrow_u64(c, 0, data[4], data + 4);
-}
-
-void EcInt::Neg256()
-{
-	u8 c = _subborrow_u64(0, 0, data[0], data + 0);
-	c = _subborrow_u64(c, 0, data[1], data + 1);
-	c = _subborrow_u64(c, 0, data[2], data + 2);
-	c = _subborrow_u64(c, 0, data[3], data + 3);
-	data[4] = 0;
-}
-
-bool EcInt::IsLessThanU(EcInt& val)
-{
-	int i = 4;
-	while (i >= 0)
-	{
-		if (data[i] != val.data[i])
-			break;
-		i--;
-	}
-	if (i < 0)
-		return false;
-	return data[i] < val.data[i];
-}
-
-bool EcInt::IsLessThanI(EcInt& val)
-{
-	if ((data[4] >> 63) && !(val.data[4] >> 63))
-		return true;
-	if (!(data[4] >> 63) && (val.data[4] >> 63))
-		return false;
-
-	int i = 4;
-	while (i >= 0)
-	{
-		if (data[i] != val.data[i])
-			break;
-		i--;
-	}
-	if (i < 0)
-		return false;
-	return data[i] < val.data[i];
-}
-
-bool EcInt::IsEqual(EcInt& val)
-{
-	return memcmp(val.data, this->data, 40) == 0;
-}
-
-bool EcInt::IsZero()
-{
-	return ((data[0] == 0) && (data[1] == 0) && (data[2] == 0) && (data[3] == 0) && (data[4] == 0));
-}
-
-void EcInt::AddModP(EcInt& val)
-{
-	Add(val);
-	if (!IsLessThanU(g_P)) 
-		Sub(g_P);
-}
-
-void EcInt::SubModP(EcInt& val)
-{
-	if (Sub(val))
-		Add(g_P);
-}
-
-//assume value < P
-void EcInt::NegModP()
-{
-	Neg();
-	Add(g_P);
-}
-
-void EcInt::ShiftRight(int nbits)
-{
-	int offset = nbits / 64;
-	if (offset)
-	{
-		for (int i = 0; i < 5 - offset; i++)
-			data[i] = data[i + offset];
-		for (int i = 5 - offset; i < 5; i++)
-			data[i] = 0;
-		nbits -= 64 * offset;
-	}
-	data[0] = __shiftright128(data[0], data[1], nbits);
-	data[1] = __shiftright128(data[1], data[2], nbits);
-	data[2] = __shiftright128(data[2], data[3], nbits);
-	data[3] = __shiftright128(data[3], data[4], nbits);
-	data[4] = ((i64)data[4]) >> nbits;
-}
-
-void EcInt::ShiftLeft(int nbits)
-{
-	int offset = nbits / 64;
-	if (offset)
-	{
-		for (int i = 4; i >= offset; i--)
-			data[i] = data[i - offset];
-		for (int i = offset - 1; i >= 0; i--)
-			data[i] = 0;
-		nbits -= 64 * offset;
-	}
-	data[4] = __shiftleft128(data[3], data[4], nbits);
-	data[3] = __shiftleft128(data[2], data[3], nbits);
-	data[2] = __shiftleft128(data[1], data[2], nbits);
-	data[1] = __shiftleft128(data[0], data[1], nbits);
-	data[0] = data[0] << nbits;
-}
-
-void EcInt::MulModP(EcInt& val)
-{	
-	u64 buff[8], tmp[5], h;
-	//calc 512 bits
-	Mul256_by_64(val.data, data[0], buff);
-	Mul256_by_64(val.data, data[1], tmp);
-	Add320_to_256(buff + 1, tmp);
-	Mul256_by_64(val.data, data[2], tmp);
-	Add320_to_256(buff + 2, tmp);
-	Mul256_by_64(val.data, data[3], tmp);
-	Add320_to_256(buff + 3, tmp);
-	//fast mod P
-	Mul256_by_64(buff + 4, P_REV, tmp);
-	u8 c = _addcarry_u64(0, buff[0], tmp[0], buff);
-	c = _addcarry_u64(c, buff[1], tmp[1], buff + 1);
-	c = _addcarry_u64(c, buff[2], tmp[2], buff + 2);
-	tmp[4] += _addcarry_u64(c, buff[3], tmp[3], buff + 3);
-	c = _addcarry_u64(0, buff[0], _umul128(tmp[4], P_REV, &h), data);
-	c = _addcarry_u64(c, buff[1], h, data + 1);
-	c = _addcarry_u64(c, 0, buff[2], data + 2);
-	data[4] = _addcarry_u64(c, buff[3], 0, data + 3);
-	while (data[4])
-		Sub(g_P);
-}
-
-void EcInt::Mul_u64(EcInt& val, u64 multiplier)
-{
-	Assign(val);
-	Mul320_by_64(data, (u64)multiplier, data);
-}
-
-void EcInt::Mul_i64(EcInt& val, i64 multiplier)
-{
-	Assign(val);
-	if (multiplier < 0)
-	{
-		Neg();
-		multiplier = -multiplier;
-	}
-	Mul320_by_64(data, (u64)multiplier, data);
-}
-
-#define APPLY_DIV_SHIFT() kbnt -= index; val >>= index; matrix[0] <<= index; matrix[1] <<= index; 
-	
-// https://tches.iacr.org/index.php/TCHES/article/download/8298/7648/4494
-//a bit tricky
-void DIV_62(i64& kbnt, i64 modp, i64 val, i64* matrix)
-{
-	int index, cnt;
-	_BitScanForward64((DWORD*)&index, val | 0x4000000000000000);
-	APPLY_DIV_SHIFT();
-	cnt = 62 - index;
-	while (cnt > 0)
-	{
-		if (kbnt < 0)
-		{
-			kbnt = -kbnt;
-			i64 tmp = -modp; modp = val; val = tmp;
-			tmp = -matrix[0]; matrix[0] = matrix[2]; matrix[2] = tmp;
-			tmp = -matrix[1]; matrix[1] = matrix[3]; matrix[3] = tmp;
-		}
-		int thr = cnt;
-		if ((kbnt + 1) < cnt)
-			thr = (int)(kbnt + 1);
-		i64 mul = (-modp * val) & ((UINT64_MAX >> (64 - thr)) & 0x07);
-		val += (modp * mul);
-		matrix[2] += (matrix[0] * mul);
-		matrix[3] += (matrix[1] * mul);
-		_BitScanForward64((DWORD*)&index, val | (1ull << cnt));
-		APPLY_DIV_SHIFT();
-		cnt -= index;
-	}
-}
-
-void EcInt::InvModP()
-{
-	i64 matrix[4];
-	EcInt result, a, tmp, tmp2;
-	EcInt modp, val;
-	i64 kbnt = -1;
-	matrix[1] = matrix[2] = 0;
-	matrix[0] = matrix[3] = 1;	
-	DIV_62(kbnt, g_P.data[0], data[0], matrix);
-	modp.Mul_i64(g_P, matrix[0]);
-	tmp.Mul_i64(*this, matrix[1]);
-	modp.Add(tmp);
-	modp.ShiftRight(62);
-	val.Mul_i64(g_P, matrix[2]);
-	tmp.Mul_i64(*this, matrix[3]);
-	val.Add(tmp);
-	val.ShiftRight(62);
-	if (matrix[1] >= 0)
-		result.Set(matrix[1]);
-	else
-	{
-		result.Set(-matrix[1]);
-		result.Neg();
-	}
-	if (matrix[3] >= 0)
-		a.Set(matrix[3]);
-	else
-	{ 
-		a.Set(-matrix[3]);
-		a.Neg();
-	}
-	Mul320_by_64(g_P.data, (result.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data);
-	result.Add(tmp);
-	result.ShiftRight(62);
-	Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data);
-	a.Add(tmp);
-	a.ShiftRight(62);
-	
-	while (val.data[0] || val.data[1] || val.data[2] || val.data[3])
-	{
-		matrix[1] = matrix[2] = 0;
-		matrix[0] = matrix[3] = 1;	
-		DIV_62(kbnt, modp.data[0], val.data[0], matrix);
-		tmp.Mul_i64(modp, matrix[0]);
-		tmp2.Mul_i64(val, matrix[1]);
-		tmp.Add(tmp2);
-		tmp2.Mul_i64(val, matrix[3]);
-		val.Mul_i64(modp, matrix[2]);
-		val.Add(tmp2);
-		val.ShiftRight(62);
-		modp = tmp;
-		modp.ShiftRight(62);
-		tmp.Mul_i64(result, matrix[0]);
-		tmp2.Mul_i64(a, matrix[1]);
-		tmp.Add(tmp2);
-		tmp2.Mul_i64(a, matrix[3]);
-		a.Mul_i64(result, matrix[2]);
-		a.Add(tmp2);
-		Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data);
-		a.Add(tmp2);
-		a.ShiftRight(62);	
-		Mul320_by_64(g_P.data, (tmp.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data);
-		result = tmp;
-		result.Add(tmp2);
-		result.ShiftRight(62);
-	}
-	Assign(result);
-	if (modp.data[4] >> 63)
-	{
-		Neg();
-		modp.Neg();	
-	}
-
-	if (modp.data[0] == 1) 
-	{
-		if (data[4] >> 63)
-			Add(g_P);
-		if (data[4] >> 63)
-			Add(g_P);
-		if (!IsLessThanU(g_P))
-			Sub(g_P);
-		if (!IsLessThanU(g_P))
-			Sub(g_P);
-	}
-	else
-		SetZero(); //error
-}
-
-// x = a^ { (p + 1) / 4 } mod p
-void EcInt::SqrtModP()
-{
-	EcInt one, res;
-	one.Set(1);
-	EcInt exp = g_P;
-	exp.Add(one);
-	exp.ShiftRight(2);
-	res.Set(1);
-	EcInt cur = *this;
-	while (!exp.IsZero())
-	{
-		if (exp.data[0] & 1)
-			res.MulModP(cur);
-		EcInt tmp = cur;
-		tmp.MulModP(cur);
-		cur = tmp;
-		exp.ShiftRight(1);
-	}
-	*this = res;
-}
-
-std::mt19937_64 rng;
-CriticalSection cs_rnd;
-
-void SetRndSeed(u64 seed)
-{
-	rng.seed(seed);
-}
-
-void EcInt::RndBits(int nbits)
-{
-	SetZero();
-	if (nbits > 256)
-		nbits = 256;
-	cs_rnd.Enter();
-	for (int i = 0; i < (nbits + 63) / 64; i++)
-		data[i] = rng();
-	cs_rnd.Leave();
-	data[nbits / 64] &= (1ull << (nbits % 64)) - 1;
-}
-
-//up to 256 bits only
-void EcInt::RndMax(EcInt& max)
-{
-	SetZero();
-	int n = 3;
-	while ((n >= 0) && !max.data[n])
-		n--;
-	if (n < 0)
-		return;
-	u64 val = max.data[n];
-	int k = 0;
-	while ((val & 0x8000000000000000) == 0)
-	{
-		val <<= 1;
-		k++;
-	}
-	int bits = 64 * n + (64 - k);
-	RndBits(bits);
-	while (!IsLessThanU(max)) // :)
-		RndBits(bits);
-}
-
-
-
-
-
+// This file is a part of RCKangaroo software
+// (c) 2024, RetiredCoder (RC)
+// License: GPLv3, see "LICENSE.TXT" file
+// https://github.com/RetiredC
+
+
+#include "defs.h"
+#include "Ec.h"
+#include <random>
+#include "utils.h"
+
+// https://en.bitcoin.it/wiki/Secp256k1
+EcInt g_P; //FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F
+EcInt g_N; //FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE BAAEDCE6 AF48A03B BFD25E8C D0364141
+EcPoint g_G; //Generator point
+
+#define P_REV	0x00000001000003D1
+
+#ifdef DEBUG_MODE
+u8* GTable = NULL; //16x16-bit table
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+bool parse_u8(const char* s, u8* res)
+{
+	char cl = toupper(s[1]);
+	char ch = toupper(s[0]);
+	if (((cl < '0') || (cl > '9')) && ((cl < 'A') || (cl > 'F')))
+		return false;
+	if (((ch < '0') || (ch > '9')) && ((ch < 'A') || (ch > 'F')))
+		return false;
+	u8 l = ((cl >= '0') && (cl <= '9')) ? (cl - '0') : (cl - 'A' + 10);
+	u8 h = ((ch >= '0') && (ch <= '9')) ? (ch - '0') : (ch - 'A' + 10);
+	*res = l + (h << 4);
+	return true;
+}
+
+bool EcPoint::IsEqual(EcPoint& pnt)
+{
+	return this->x.IsEqual(pnt.x) && this->y.IsEqual(pnt.y);
+}
+
+void EcPoint::LoadFromBuffer64(u8* buffer)
+{
+	memcpy(x.data, buffer, 32);
+	x.data[4] = 0;
+	memcpy(y.data, buffer + 32, 32);
+	y.data[4] = 0;
+}
+
+void EcPoint::SaveToBuffer64(u8* buffer)
+{
+	memcpy(buffer, x.data, 32);
+	memcpy(buffer + 32, y.data, 32);
+}
+
+bool EcPoint::SetHexStr(const char* str)
+{
+	EcPoint res;
+	int len = (int)strlen(str);
+	if (len < 66)
+		return false;
+	u8 type, b;
+	if (!parse_u8(str, &type))
+		return false;
+	if ((type < 2) || (type > 4))
+		return false;
+	if (((type == 2) || (type == 3)) && (len != 66))
+		return false;
+	if ((type == 4) && (len != 130))
+		return false;
+
+	if (len == 66) //compressed
+	{
+		str += 2;
+		for (int i = 0; i < 32; i++)
+		{
+			if (!parse_u8(str + 2 * i, &b))
+				return false;
+			((u8*)res.x.data)[31 - i] = b;
+		}
+		res.y = Ec::CalcY(res.x, type == 2);
+		if (!Ec::IsValidPoint(res))
+			return false;		
+		*this = res;
+		return true;
+	}
+	//uncompressed
+	str += 2;
+	for (int i = 0; i < 32; i++)
+	{
+		if (!parse_u8(str + 2 * i, &b))
+			return false;
+		((u8*)res.x.data)[31 - i] = b;
+
+		if (!parse_u8(str + 2 * i + 64, &b))
+			return false;
+		((u8*)res.y.data)[31 - i] = b;
+	}
+	if (!Ec::IsValidPoint(res))
+		return false;
+	*this = res;
+	return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// https://en.bitcoin.it/wiki/Secp256k1
+void InitEc()
+{
+	g_P.SetHexStr("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"); //Fp
+	g_G.x.SetHexStr("79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798"); //G.x
+	g_G.y.SetHexStr("483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8"); //G.y
+	g_N.SetHexStr("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141"); //order of G
+#ifdef DEBUG_MODE
+	GTable = (u8*)malloc(16 * 256 * 256 * 64);
+	EcPoint pnt = g_G;
+	for (int i = 0; i < 16; i++)
+	{
+		pnt.SaveToBuffer64(GTable + (i * 256 * 256) * 64);
+		EcPoint tmp = pnt;
+		pnt = Ec::DoublePoint(pnt);
+		for (int j = 1; j < 256 * 256 - 1; j++)
+		{
+			pnt.SaveToBuffer64(GTable + (i * 256 * 256 + j) * 64);
+			pnt = Ec::AddPoints(pnt, tmp);
+		}
+	}
+#endif
+};
+
+void DeInitEc()
+{
+#ifdef DEBUG_MODE
+	if (GTable)
+		free(GTable);
+#endif
+}
+
+// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_addition
+EcPoint Ec::AddPoints(EcPoint& pnt1, EcPoint& pnt2)
+{
+	EcPoint res;
+	EcInt dx, dy, lambda, lambda2;
+
+	dx = pnt2.x;
+	dx.SubModP(pnt1.x);
+	dx.InvModP();
+
+	dy = pnt2.y;
+	dy.SubModP(pnt1.y);
+
+	lambda = dy;
+	lambda.MulModP(dx);
+	lambda2 = lambda;
+	lambda2.MulModP(lambda);
+
+	res.x = lambda2;
+	res.x.SubModP(pnt1.x);
+	res.x.SubModP(pnt2.x);
+
+	res.y = pnt2.x;
+	res.y.SubModP(res.x);
+	res.y.MulModP(lambda);
+	res.y.SubModP(pnt2.y);
+	return res;
+}
+
+// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_doubling
+EcPoint Ec::DoublePoint(EcPoint& pnt)
+{
+	EcPoint res;
+	EcInt t1, t2, lambda, lambda2;
+
+	t1 = pnt.y;
+	t1.AddModP(pnt.y);
+	t1.InvModP();
+
+	t2 = pnt.x;
+	t2.MulModP(pnt.x);
+	lambda = t2;
+	lambda.AddModP(t2);
+	lambda.AddModP(t2);
+	lambda.MulModP(t1);
+	lambda2 = lambda;
+	lambda2.MulModP(lambda);
+
+	res.x = lambda2;
+	res.x.SubModP(pnt.x);
+	res.x.SubModP(pnt.x);
+
+	res.y = pnt.x;
+	res.y.SubModP(res.x);
+	res.y.MulModP(lambda);
+	res.y.SubModP(pnt.y);
+	return res;
+}
+
+//k up to 256 bits
+EcPoint Ec::MultiplyG(EcInt& k)
+{
+	EcPoint res;
+	EcPoint t = g_G;
+	bool first = true;
+	int n = 3;
+	while ((n >= 0) && !k.data[n])
+		n--;
+	if (n < 0)
+		return res; //error
+	int index;                     
+	_BitScanReverse64((DWORD*)&index, k.data[n]);
+	for (int i = 0; i <= 64 * n + index; i++)
+	{
+		u8 v = (k.data[i / 64] >> (i % 64)) & 1;	
+		if (v)
+		{
+			if (first)
+			{
+				first = false;
+				res = t;
+			}
+			else
+				res = Ec::AddPoints(res, t);
+		}
+		t = Ec::DoublePoint(t);
+	}
+	return res;
+}
+
+#ifdef DEBUG_MODE
+//uses gTable (16x16-bit) to speedup calculation
+EcPoint Ec::MultiplyG_Fast(EcInt& k)
+{
+	int i;
+	u16 b;
+	EcPoint pnt, res;
+	for (i = 0; i < 16; i++)
+	{
+		b = k.GetU16(i);
+		if (b)
+			break;
+	}
+	if (i >= 16)
+		return pnt;
+	if (i < 16)
+	{
+		res.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64);
+		i++;
+	}
+	while (i < 16)
+	{
+		b = k.GetU16(i);
+		if (b)
+		{
+			pnt.LoadFromBuffer64(GTable + (256 * 256 * i + (b - 1)) * 64);
+			res = AddPoints(res, pnt);
+		}
+		i++;
+	}
+	return res;
+}
+#endif
+
+EcInt Ec::CalcY(EcInt& x, bool is_even)
+{
+	EcInt res;
+	EcInt tmp;
+	tmp.Set(7);
+	res = x;
+	res.MulModP(x);
+	res.MulModP(x);
+	res.AddModP(tmp);
+	res.SqrtModP();
+	if ((res.data[0] & 1) == is_even)
+		res.NegModP();
+	return res;
+}
+
+bool Ec::IsValidPoint(EcPoint& pnt)
+{
+	EcInt x, y, seven;
+	seven.Set(7);
+	x = pnt.x;
+	x.MulModP(pnt.x);
+	x.MulModP(pnt.x);
+	x.AddModP(seven);
+	y = pnt.y;
+	y.MulModP(pnt.y);
+	return x.IsEqual(y);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void Mul256_by_64(u64* input, u64 multiplier, u64* result)
+{
+	u64 h1, h2;
+	result[0] = _umul128(input[0], multiplier, &h1);
+	u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1);
+	carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2);
+	carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3);
+	_addcarry_u64(carry, 0, h2, result + 4);
+}
+
+void Mul320_by_64(u64* input, u64 multiplier, u64* result)
+{
+	u64 h1, h2;
+	result[0] = _umul128(input[0], multiplier, &h1);
+	u8 carry = _addcarry_u64(0, _umul128(input[1], multiplier, &h2), h1, result + 1);
+	carry = _addcarry_u64(carry, _umul128(input[2], multiplier, &h1), h2, result + 2);
+	carry = _addcarry_u64(carry, _umul128(input[3], multiplier, &h2), h1, result + 3);
+	_addcarry_u64(carry, _umul128(input[4], multiplier, &h1), h2, result + 4);
+}
+
+void Add320_to_256(u64* in_out, u64* val)
+{
+	u8 c = _addcarry_u64(0, in_out[0], val[0], in_out);
+	c = _addcarry_u64(c, in_out[1], val[1], in_out + 1);
+	c = _addcarry_u64(c, in_out[2], val[2], in_out + 2);
+	c = _addcarry_u64(c, in_out[3], val[3], in_out + 3);
+	_addcarry_u64(c, 0, val[4], in_out + 4);
+}
+
+EcInt::EcInt()
+{
+	SetZero();
+}
+
+void EcInt::Assign(EcInt& val)
+{
+	memcpy(data, val.data, sizeof(data));
+}
+
+void EcInt::Set(u64 val)
+{
+	SetZero();
+	data[0] = val;
+}
+
+void EcInt::SetZero()
+{
+	memset(data, 0, sizeof(data));
+}
+
+bool EcInt::SetHexStr(const char* str)
+{
+	SetZero();
+	int len = (int)strlen(str);
+	if (len > 64)
+		return false;
+	char s[64];
+	memset(s, '0', 64);
+	memcpy(s + 64 - len, str, len);
+	for (int i = 0; i < 32; i++)
+	{
+		int n = 62 - 2 * i;
+		u8 b;
+		if (!parse_u8(s + n, &b))
+			return false;
+		((u8*)data)[i] = b;
+	}
+	return true;
+}
+
+void EcInt::GetHexStr(char* str)
+{
+	for (int i = 0; i < 32; i++)
+		sprintf(str + 2 * i, "%02X", ((u8*)data)[31 - i]);
+	str[64] = 0;
+}
+
+u16 EcInt::GetU16(int index)
+{
+	return (u16)(data[index / 4] >> (16 * (index % 4)));
+}
+
+//returns carry
+bool EcInt::Add(EcInt& val)
+{
+	u8 c = _addcarry_u64(0, data[0], val.data[0], data + 0);
+	c = _addcarry_u64(c, data[1], val.data[1], data + 1);
+	c = _addcarry_u64(c, data[2], val.data[2], data + 2);
+	c = _addcarry_u64(c, data[3], val.data[3], data + 3);
+	return _addcarry_u64(c, data[4], val.data[4], data + 4) != 0;
+}
+
+//returns carry
+bool EcInt::Sub(EcInt& val)
+{
+	u8 c = _subborrow_u64(0, data[0], val.data[0], data + 0);
+	c = _subborrow_u64(c, data[1], val.data[1], data + 1);
+	c = _subborrow_u64(c, data[2], val.data[2], data + 2);
+	c = _subborrow_u64(c, data[3], val.data[3], data + 3);
+	return _subborrow_u64(c, data[4], val.data[4], data + 4) != 0;
+}
+
+void EcInt::Neg()
+{
+	u8 c = _subborrow_u64(0, 0, data[0], data + 0);
+	c = _subborrow_u64(c, 0, data[1], data + 1);
+	c = _subborrow_u64(c, 0, data[2], data + 2);
+	c = _subborrow_u64(c, 0, data[3], data + 3);
+	_subborrow_u64(c, 0, data[4], data + 4);
+}
+
+void EcInt::Neg256()
+{
+	u8 c = _subborrow_u64(0, 0, data[0], data + 0);
+	c = _subborrow_u64(c, 0, data[1], data + 1);
+	c = _subborrow_u64(c, 0, data[2], data + 2);
+	c = _subborrow_u64(c, 0, data[3], data + 3);
+	data[4] = 0;
+}
+
+bool EcInt::IsLessThanU(EcInt& val)
+{
+	int i = 4;
+	while (i >= 0)
+	{
+		if (data[i] != val.data[i])
+			break;
+		i--;
+	}
+	if (i < 0)
+		return false;
+	return data[i] < val.data[i];
+}
+
+bool EcInt::IsLessThanI(EcInt& val)
+{
+	if ((data[4] >> 63) && !(val.data[4] >> 63))
+		return true;
+	if (!(data[4] >> 63) && (val.data[4] >> 63))
+		return false;
+
+	int i = 4;
+	while (i >= 0)
+	{
+		if (data[i] != val.data[i])
+			break;
+		i--;
+	}
+	if (i < 0)
+		return false;
+	return data[i] < val.data[i];
+}
+
+bool EcInt::IsEqual(EcInt& val)
+{
+	return memcmp(val.data, this->data, 40) == 0;
+}
+
+bool EcInt::IsZero()
+{
+	return ((data[0] == 0) && (data[1] == 0) && (data[2] == 0) && (data[3] == 0) && (data[4] == 0));
+}
+
+void EcInt::AddModP(EcInt& val)
+{
+	Add(val);
+	if (!IsLessThanU(g_P)) 
+		Sub(g_P);
+}
+
+void EcInt::SubModP(EcInt& val)
+{
+	if (Sub(val))
+		Add(g_P);
+}
+
+//assume value < P
+void EcInt::NegModP()
+{
+	Neg();
+	Add(g_P);
+}
+
+//assume value < N
+void EcInt::NegModN()
+{
+	Neg();
+	Add(g_N);
+}
+
+void EcInt::ShiftRight(int nbits)
+{
+	int offset = nbits / 64;
+	if (offset)
+	{
+		for (int i = 0; i < 5 - offset; i++)
+			data[i] = data[i + offset];
+		for (int i = 5 - offset; i < 5; i++)
+			data[i] = 0;
+		nbits -= 64 * offset;
+	}
+	data[0] = __shiftright128(data[0], data[1], nbits);
+	data[1] = __shiftright128(data[1], data[2], nbits);
+	data[2] = __shiftright128(data[2], data[3], nbits);
+	data[3] = __shiftright128(data[3], data[4], nbits);
+	data[4] = ((i64)data[4]) >> nbits;
+}
+
+void EcInt::ShiftLeft(int nbits)
+{
+	int offset = nbits / 64;
+	if (offset)
+	{
+		for (int i = 4; i >= offset; i--)
+			data[i] = data[i - offset];
+		for (int i = offset - 1; i >= 0; i--)
+			data[i] = 0;
+		nbits -= 64 * offset;
+	}
+	data[4] = __shiftleft128(data[3], data[4], nbits);
+	data[3] = __shiftleft128(data[2], data[3], nbits);
+	data[2] = __shiftleft128(data[1], data[2], nbits);
+	data[1] = __shiftleft128(data[0], data[1], nbits);
+	data[0] = data[0] << nbits;
+}
+
+void EcInt::MulModP(EcInt& val)
+{	
+	u64 buff[8], tmp[5], h;
+	//calc 512 bits
+	Mul256_by_64(val.data, data[0], buff);
+	Mul256_by_64(val.data, data[1], tmp);
+	Add320_to_256(buff + 1, tmp);
+	Mul256_by_64(val.data, data[2], tmp);
+	Add320_to_256(buff + 2, tmp);
+	Mul256_by_64(val.data, data[3], tmp);
+	Add320_to_256(buff + 3, tmp);
+	//fast mod P
+	Mul256_by_64(buff + 4, P_REV, tmp);
+	u8 c = _addcarry_u64(0, buff[0], tmp[0], buff);
+	c = _addcarry_u64(c, buff[1], tmp[1], buff + 1);
+	c = _addcarry_u64(c, buff[2], tmp[2], buff + 2);
+	tmp[4] += _addcarry_u64(c, buff[3], tmp[3], buff + 3);
+	c = _addcarry_u64(0, buff[0], _umul128(tmp[4], P_REV, &h), data);
+	c = _addcarry_u64(c, buff[1], h, data + 1);
+	c = _addcarry_u64(c, 0, buff[2], data + 2);
+	data[4] = _addcarry_u64(c, buff[3], 0, data + 3);
+}
+
+void EcInt::Mul_u64(EcInt& val, u64 multiplier)
+{
+	Assign(val);
+	Mul320_by_64(data, (u64)multiplier, data);
+}
+
+void EcInt::Mul_i64(EcInt& val, i64 multiplier)
+{
+	Assign(val);
+	if (multiplier < 0)
+	{
+		Neg();
+		multiplier = -multiplier;
+	}
+	Mul320_by_64(data, (u64)multiplier, data);
+}
+
+#define APPLY_DIV_SHIFT() kbnt -= index; val >>= index; matrix[0] <<= index; matrix[1] <<= index; 
+	
+// https://tches.iacr.org/index.php/TCHES/article/download/8298/7648/4494
+//a bit tricky
+void DIV_62(i64& kbnt, i64 modp, i64 val, i64* matrix)
+{
+	int index, cnt;
+	_BitScanForward64((DWORD*)&index, val | 0x4000000000000000);
+	APPLY_DIV_SHIFT();
+	cnt = 62 - index;
+	while (cnt > 0)
+	{
+		if (kbnt < 0)
+		{
+			kbnt = -kbnt;
+			i64 tmp = -modp; modp = val; val = tmp;
+			tmp = -matrix[0]; matrix[0] = matrix[2]; matrix[2] = tmp;
+			tmp = -matrix[1]; matrix[1] = matrix[3]; matrix[3] = tmp;
+		}
+		int thr = cnt;
+		if ((kbnt + 1) < cnt)
+			thr = (int)(kbnt + 1);
+		i64 mul = (-modp * val) & ((UINT64_MAX >> (64 - thr)) & 0x07);
+		val += (modp * mul);
+		matrix[2] += (matrix[0] * mul);
+		matrix[3] += (matrix[1] * mul);
+		_BitScanForward64((DWORD*)&index, val | (1ull << cnt));
+		APPLY_DIV_SHIFT();
+		cnt -= index;
+	}
+}
+
+void EcInt::InvModP()
+{
+	i64 matrix[4];
+	EcInt result, a, tmp, tmp2;
+	EcInt modp, val;
+	i64 kbnt = -1;
+	matrix[1] = matrix[2] = 0;
+	matrix[0] = matrix[3] = 1;	
+	DIV_62(kbnt, g_P.data[0], data[0], matrix);
+	modp.Mul_i64(g_P, matrix[0]);
+	tmp.Mul_i64(*this, matrix[1]);
+	modp.Add(tmp);
+	modp.ShiftRight(62);
+	val.Mul_i64(g_P, matrix[2]);
+	tmp.Mul_i64(*this, matrix[3]);
+	val.Add(tmp);
+	val.ShiftRight(62);
+	if (matrix[1] >= 0)
+		result.Set(matrix[1]);
+	else
+	{
+		result.Set(-matrix[1]);
+		result.Neg();
+	}
+	if (matrix[3] >= 0)
+		a.Set(matrix[3]);
+	else
+	{ 
+		a.Set(-matrix[3]);
+		a.Neg();
+	}
+	Mul320_by_64(g_P.data, (result.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data);
+	result.Add(tmp);
+	result.ShiftRight(62);
+	Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp.data);
+	a.Add(tmp);
+	a.ShiftRight(62);
+	
+	while (val.data[0] || val.data[1] || val.data[2] || val.data[3])
+	{
+		matrix[1] = matrix[2] = 0;
+		matrix[0] = matrix[3] = 1;	
+		DIV_62(kbnt, modp.data[0], val.data[0], matrix);
+		tmp.Mul_i64(modp, matrix[0]);
+		tmp2.Mul_i64(val, matrix[1]);
+		tmp.Add(tmp2);
+		tmp2.Mul_i64(val, matrix[3]);
+		val.Mul_i64(modp, matrix[2]);
+		val.Add(tmp2);
+		val.ShiftRight(62);
+		modp = tmp;
+		modp.ShiftRight(62);
+		tmp.Mul_i64(result, matrix[0]);
+		tmp2.Mul_i64(a, matrix[1]);
+		tmp.Add(tmp2);
+		tmp2.Mul_i64(a, matrix[3]);
+		a.Mul_i64(result, matrix[2]);
+		a.Add(tmp2);
+		Mul320_by_64(g_P.data, (a.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data);
+		a.Add(tmp2);
+		a.ShiftRight(62);	
+		Mul320_by_64(g_P.data, (tmp.data[0] * 0xD838091DD2253531) & 0x3FFFFFFFFFFFFFFF, tmp2.data);
+		result = tmp;
+		result.Add(tmp2);
+		result.ShiftRight(62);
+	}
+	Assign(result);
+	if (modp.data[4] >> 63)
+	{
+		Neg();
+		modp.Neg();	
+	}
+
+	if (modp.data[0] == 1) 
+	{
+		if (data[4] >> 63)
+			Add(g_P);
+		if (data[4] >> 63)
+			Add(g_P);
+		if (!IsLessThanU(g_P))
+			Sub(g_P);
+		if (!IsLessThanU(g_P))
+			Sub(g_P);
+	}
+	else
+		SetZero(); //error
+}
+
+// x = a^ { (p + 1) / 4 } mod p
+void EcInt::SqrtModP()
+{
+	EcInt one, res;
+	one.Set(1);
+	EcInt exp = g_P;
+	exp.Add(one);
+	exp.ShiftRight(2);
+	res.Set(1);
+	EcInt cur = *this;
+	while (!exp.IsZero())
+	{
+		if (exp.data[0] & 1)
+			res.MulModP(cur);
+		EcInt tmp = cur;
+		tmp.MulModP(cur);
+		cur = tmp;
+		exp.ShiftRight(1);
+	}
+	*this = res;
+}
+
+std::mt19937_64 rng;
+CriticalSection cs_rnd;
+
+void SetRndSeed(u64 seed)
+{
+	rng.seed(seed);
+}
+
+void EcInt::RndBits(int nbits)
+{
+	SetZero();
+	if (nbits > 256)
+		nbits = 256;
+	cs_rnd.Enter();
+	for (int i = 0; i < (nbits + 63) / 64; i++)
+		data[i] = rng();
+	cs_rnd.Leave();
+	data[nbits / 64] &= (1ull << (nbits % 64)) - 1;
+}
+
+//up to 256 bits only
+void EcInt::RndMax(EcInt& max)
+{
+	SetZero();
+	int n = 3;
+	while ((n >= 0) && !max.data[n])
+		n--;
+	if (n < 0)
+		return;
+	u64 val = max.data[n];
+	int k = 0;
+	while ((val & 0x8000000000000000) == 0)
+	{
+		val <<= 1;
+		k++;
+	}
+	int bits = 64 * n + (64 - k);
+	RndBits(bits);
+	while (!IsLessThanU(max)) // :)
+		RndBits(bits);
+}
+
+
+// ============================================================================
+// Extensión: w-NAF (w=4) para multiplicación escalar de G en CPU
+// - Implementa una versión canónica de w-NAF usando las primitivas afines
+//   ya existentes (AddPoints / DoublePoint). No toca el resto del proyecto.
+// ============================================================================
+
+namespace {
+    static inline bool limb_is_zero(const u64 a[4]) {
+        return (a[0]|a[1]|a[2]|a[3])==0ull;
+    }
+    static inline bool limb_is_odd(const u64 a[4]) {
+        return (a[0] & 1ull) != 0ull;
+    }
+    static inline void limb_sub_small(u64 a[4], unsigned v) {
+        unsigned long long b = v;
+        unsigned __int128 t = (unsigned __int128)a[0] - b;
+        a[0] = (u64)t;
+        unsigned __int128 borrow = (t >> 127) & 1;
+        for (int i=1;i<4 && borrow;i++) {
+            unsigned __int128 t2 = (unsigned __int128)a[i] - 1;
+            a[i] = (u64)t2;
+            borrow = (t2 >> 127) & 1;
+        }
+    }
+    static inline void limb_add_small(u64 a[4], unsigned v) {
+        unsigned __int128 t = (unsigned __int128)a[0] + (unsigned)v;
+        a[0] = (u64)t;
+        u64 carry = (u64)(t >> 64);
+        for (int i=1;i<4 && carry;i++) {
+            unsigned __int128 t2 = (unsigned __int128)a[i] + 1;
+            a[i] = (u64)t2;
+            carry = (u64)(t2 >> 64);
+        }
+    }
+    static inline void limb_shr1(u64 a[4]) {
+        u64 c = 0;
+        for (int i=3;i>=0;i--) {
+            u64 n = a[i];
+            a[i] = (n >> 1) | (c << 63);
+            c = n & 1ull;
+        }
+    }
+} // anonymous
+
+EcPoint EcEx::MultiplyG_WNAF4(const EcInt &k) {
+    // Precompute odd multiples of G: [1,3,5,7,9,11,13,15]G
+    EcPoint pre[8];
+    pre[0] = g_G;
+    EcPoint twoG = Ec::DoublePoint(g_G);
+    for (int i=1;i<8;i++) pre[i] = Ec::AddPoints(pre[i-1], twoG);
+
+    // Convert k to a 256-bit limb array
+    u64 kk[4] = {k.data[0], k.data[1], k.data[2], k.data[3]};
+
+    // w-NAF digit expansion
+    int digits[300]; int nd=0;
+    while (!limb_is_zero(kk)) {
+        int ui = 0;
+        if (limb_is_odd(kk)) {
+            int mod16 = (int)(kk[0] & 15ull);
+            if (mod16 > 8) ui = mod16 - 16; else ui = mod16;
+            if (ui >= 0) limb_sub_small(kk, (unsigned)ui);
+            else         limb_add_small(kk, (unsigned)(-ui));
+        }
+        digits[nd++] = ui;
+        limb_shr1(kk);
+    }
+
+    // Evaluate
+    bool R_is_inf = true;
+    EcPoint R; // undefined if inf
+    for (int i=nd-1;i>=0;i--) {
+        if (!R_is_inf) R = Ec::DoublePoint(R);
+        int ui = digits[i];
+        if (ui != 0) {
+            EcPoint add = pre[(abs(ui)-1)>>1];
+            if (ui < 0) { add.y.NegModP(); }
+            if (R_is_inf) { R = add; R_is_inf = false; }
+            else { R = Ec::AddPoints(R, add); }
+        }
+    }
+    if (R_is_inf) return EcPoint(); // zero
+    return R;
+}
diff --git a/Ec.h b/Ec.h
index fa123e3..3018dd7 100644
--- a/Ec.h
+++ b/Ec.h
@@ -1,77 +1,80 @@
-// This file is a part of RCKangaroo software
-// (c) 2024, RetiredCoder (RC)
-// License: GPLv3, see "LICENSE.TXT" file
-// https://github.com/RetiredC/Kang-2
-
-
-#pragma once
-
-#include "defs.h"
-#include "utils.h"
-
-class EcInt
-{
-public:
-	EcInt();
-
-	void Assign(EcInt& val);
-	void Set(u64 val);
-	void SetZero();
-	bool SetHexStr(const char* str);
-	void GetHexStr(char* str);
-	u16 GetU16(int index);
-
-	bool Add(EcInt& val); //returns true if carry
-	bool Sub(EcInt& val); //returns true if carry
-	void Neg();
-	void Neg256();
-	void ShiftRight(int nbits);
-	void ShiftLeft(int nbits);
-	bool IsLessThanU(EcInt& val);
-	bool IsLessThanI(EcInt& val);
-	bool IsEqual(EcInt& val);
-	bool IsZero();
-
-	void Mul_u64(EcInt& val, u64 multiplier);
-	void Mul_i64(EcInt& val, i64 multiplier);
-
-	void AddModP(EcInt& val);
-	void SubModP(EcInt& val);
-	void NegModP();
-	void MulModP(EcInt& val);
-	void InvModP();
-	void SqrtModP();
-
-	void RndBits(int nbits);
-	void RndMax(EcInt& max);
-
-	u64 data[4 + 1];
-};
-
-class EcPoint
-{
-public:
-	bool IsEqual(EcPoint& pnt);
-	void LoadFromBuffer64(u8* buffer);
-	void SaveToBuffer64(u8* buffer);
-	bool SetHexStr(const char* str);
-	EcInt x;
-	EcInt y;
-};
-
-class Ec
-{
-public:
-	static EcPoint AddPoints(EcPoint& pnt1, EcPoint& pnt2);
-	static EcPoint DoublePoint(EcPoint& pnt);
-	static EcPoint MultiplyG(EcInt& k);
-#ifdef DEBUG_MODE
-	static EcPoint MultiplyG_Fast(EcInt& k);
-#endif
-	static EcInt CalcY(EcInt& x, bool is_even);
-	static bool IsValidPoint(EcPoint& pnt);
-};
-
-void InitEc();
-void DeInitEc();
-void SetRndSeed(u64 seed);
\ No newline at end of file
+// This file is a part of RCKangaroo software
+// (c) 2024, RetiredCoder (RC)
+// License: GPLv3, see "LICENSE.TXT" file
+// https://github.com/RetiredC/Kang-2
+
+
+#pragma once
+
+#include "defs.h"
+#include "utils.h"
+
+class EcInt
+{
+public:
+	EcInt();
+
+	void Assign(EcInt& val);
+	void Set(u64 val);
+	void SetZero();
+	bool SetHexStr(const char* str);
+	void GetHexStr(char* str);
+	u16 GetU16(int index);
+
+	bool Add(EcInt& val); //returns true if carry
+	bool Sub(EcInt& val); //returns true if carry
+	void Neg();
+	void Neg256();
+	void ShiftRight(int nbits);
+	void ShiftLeft(int nbits);
+	bool IsLessThanU(EcInt& val);
+	bool IsLessThanI(EcInt& val);
+	bool IsEqual(EcInt& val);
+	bool IsZero();
+
+	void Mul_u64(EcInt& val, u64 multiplier);
+	void Mul_i64(EcInt& val, i64 multiplier);
+
+	void AddModP(EcInt& val);
+	void SubModP(EcInt& val);
+	void NegModP();
+	void NegModN();
+	void MulModP(EcInt& val);
+	void InvModP();
+	void SqrtModP();
+
+	void RndBits(int nbits);
+	void RndMax(EcInt& max);
+
+	u64 data[4 + 1];
+};
+
+class EcPoint
+{
+public:
+	bool IsEqual(EcPoint& pnt);
+	void LoadFromBuffer64(u8* buffer);
+	void SaveToBuffer64(u8* buffer);
+	bool SetHexStr(const char* str);
+	EcInt x;
+	EcInt y;
+};
+
+class Ec
+{
+public:
+	static EcPoint AddPoints(EcPoint& pnt1, EcPoint& pnt2);
+	static EcPoint DoublePoint(EcPoint& pnt);
+	static EcPoint MultiplyG(EcInt& k);
+#ifdef DEBUG_MODE
+	static EcPoint MultiplyG_Fast(EcInt& k);
+#endif
+	static EcInt CalcY(EcInt& x, bool is_even);
+	static bool IsValidPoint(EcPoint& pnt);
+};
+
+void InitEc();
+void DeInitEc();
+void SetRndSeed(u64 seed);
+
+class EcEx { public: static EcPoint MultiplyG_WNAF4(const EcInt &k); };
diff --git a/GpuKang.cpp b/GpuKang.cpp
index f792d8c..d9b1bcd 100644
--- a/GpuKang.cpp
+++ b/GpuKang.cpp
@@ -1,517 +1,522 @@
-// This file is a part of RCKangaroo software
-// (c) 2024, RetiredCoder (RC)
-// License: GPLv3, see "LICENSE.TXT" file
-// https://github.com/RetiredC
-
-
-#include <iostream>
-#include "cuda_runtime.h"
-#include "cuda.h"
-
-#include "GpuKang.h"
-
-cudaError_t cuSetGpuParams(TKparams Kparams, u64* _jmp2_table);
-void CallGpuKernelGen(TKparams Kparams);
-void CallGpuKernelABC(TKparams Kparams);
-void AddPointsToList(u32* data, int cnt, u64 ops_cnt);
-extern bool gGenMode; //tames generation mode
-
-int RCGpuKang::CalcKangCnt()
-{
-	Kparams.BlockCnt = mpCnt;
-	Kparams.BlockSize = IsOldGpu ? 512 : 256;
-	Kparams.GroupCnt = IsOldGpu ? 64 : 24;
-	return Kparams.BlockSize* Kparams.GroupCnt* Kparams.BlockCnt;
-}
-
-//executes in main thread
-bool RCGpuKang::Prepare(EcPoint _PntToSolve, int _Range, int _DP, EcJMP* _EcJumps1, EcJMP* _EcJumps2, EcJMP* _EcJumps3)
-{
-	PntToSolve = _PntToSolve;
-	Range = _Range;
-	DP = _DP;
-	EcJumps1 = _EcJumps1;
-	EcJumps2 = _EcJumps2;
-	EcJumps3 = _EcJumps3;
-	StopFlag = false;
-	Failed = false;
-	u64 total_mem = 0;
-	memset(dbg, 0, sizeof(dbg));
-	memset(SpeedStats, 0, sizeof(SpeedStats));
-	cur_stats_ind = 0;
-
-	cudaError_t err;
-	err = cudaSetDevice(CudaIndex);
-	if (err != cudaSuccess)
-		return false;
-
-	Kparams.BlockCnt = mpCnt;
-	Kparams.BlockSize = IsOldGpu ? 512 : 256;
-	Kparams.GroupCnt = IsOldGpu ? 64 : 24;
-	KangCnt = Kparams.BlockSize * Kparams.GroupCnt * Kparams.BlockCnt;
-	Kparams.KangCnt = KangCnt;
-	Kparams.DP = DP;
-	Kparams.KernelA_LDS_Size = 64 * JMP_CNT + 16 * Kparams.BlockSize;
-	Kparams.KernelB_LDS_Size = 64 * JMP_CNT;
-	Kparams.KernelC_LDS_Size = 96 * JMP_CNT;
-	Kparams.IsGenMode = gGenMode;
-
-//allocate gpu mem
-	u64 size;
-	if (!IsOldGpu)
-	{
-		//L2	
-		int L2size = Kparams.KangCnt * (3 * 32);
-		total_mem += L2size;
-		err = cudaMalloc((void**)&Kparams.L2, L2size);
-		if (err != cudaSuccess)
-		{
-			printf("GPU %d, Allocate L2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-			return false;
-		}
-		size = L2size;
-		if (size > persistingL2CacheMaxSize)
-			size = persistingL2CacheMaxSize;
-		err = cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, size); // set max allowed size for L2
-		//persisting for L2
-		cudaStreamAttrValue stream_attribute;                                                   
-		stream_attribute.accessPolicyWindow.base_ptr = Kparams.L2;
-		stream_attribute.accessPolicyWindow.num_bytes = size;										
-		stream_attribute.accessPolicyWindow.hitRatio = 1.0;                                     
-		stream_attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;             
-		stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;  	
-		err = cudaStreamSetAttribute(NULL, cudaStreamAttributeAccessPolicyWindow, &stream_attribute);
-		if (err != cudaSuccess)
-		{
-			printf("GPU %d, cudaStreamSetAttribute failed: %s\n", CudaIndex, cudaGetErrorString(err));
-			return false;
-		}
-	}
-	size = MAX_DP_CNT * GPU_DP_SIZE + 16;
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.DPs_out, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate GpuOut memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = KangCnt * 96;
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.Kangs, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate pKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	total_mem += JMP_CNT * 96;
-	err = cudaMalloc((void**)&Kparams.Jumps1, JMP_CNT * 96);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	total_mem += JMP_CNT * 96;
-	err = cudaMalloc((void**)&Kparams.Jumps2, JMP_CNT * 96);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	total_mem += JMP_CNT * 96;
-	err = cudaMalloc((void**)&Kparams.Jumps3, JMP_CNT * 96);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate Jumps3 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = 2 * (u64)KangCnt * STEP_CNT;
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.JumpsList, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate JumpsList memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = (u64)KangCnt * (16 * DPTABLE_MAX_CNT + sizeof(u32)); //we store 16bytes of X
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.DPTable, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate DPTable memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = mpCnt * Kparams.BlockSize * sizeof(u64);
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.L1S2, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate L1S2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = (u64)KangCnt * MD_LEN * (2 * 32);
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.LastPnts, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = (u64)KangCnt * MD_LEN * sizeof(u64);
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.LoopTable, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	total_mem += 1024;
-	err = cudaMalloc((void**)&Kparams.dbg_buf, 1024);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate dbg_buf memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	size = sizeof(u32) * KangCnt + 8;
-	total_mem += size;
-	err = cudaMalloc((void**)&Kparams.LoopedKangs, size);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d Allocate LoopedKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-
-	DPs_out = (u32*)malloc(MAX_DP_CNT * GPU_DP_SIZE);
-
-//jmp1
-	u64* buf = (u64*)malloc(JMP_CNT * 96);
-	for (int i = 0; i < JMP_CNT; i++)
-	{
-		memcpy(buf + i * 12, EcJumps1[i].p.x.data, 32);
-		memcpy(buf + i * 12 + 4, EcJumps1[i].p.y.data, 32);
-		memcpy(buf + i * 12 + 8, EcJumps1[i].dist.data, 32);
-	}
-	err = cudaMemcpy(Kparams.Jumps1, buf, JMP_CNT * 96, cudaMemcpyHostToDevice);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d, cudaMemcpy Jumps1 failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-	free(buf);
-//jmp2
-	buf = (u64*)malloc(JMP_CNT * 96);
-	u64* jmp2_table = (u64*)malloc(JMP_CNT * 64);
-	for (int i = 0; i < JMP_CNT; i++)
-	{
-		memcpy(buf + i * 12, EcJumps2[i].p.x.data, 32);
-		memcpy(jmp2_table + i * 8, EcJumps2[i].p.x.data, 32);
-		memcpy(buf + i * 12 + 4, EcJumps2[i].p.y.data, 32);
-		memcpy(jmp2_table + i * 8 + 4, EcJumps2[i].p.y.data, 32);
-		memcpy(buf + i * 12 + 8, EcJumps2[i].dist.data, 32);
-	}
-	err = cudaMemcpy(Kparams.Jumps2, buf, JMP_CNT * 96, cudaMemcpyHostToDevice);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d, cudaMemcpy Jumps2 failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-	free(buf);
-
-	err = cuSetGpuParams(Kparams, jmp2_table);
-	if (err != cudaSuccess)
-	{
-		free(jmp2_table);
-		printf("GPU %d, cuSetGpuParams failed: %s!\r\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-	free(jmp2_table);
-//jmp3
-	buf = (u64*)malloc(JMP_CNT * 96);
-	for (int i = 0; i < JMP_CNT; i++)
-	{
-		memcpy(buf + i * 12, EcJumps3[i].p.x.data, 32);
-		memcpy(buf + i * 12 + 4, EcJumps3[i].p.y.data, 32);
-		memcpy(buf + i * 12 + 8, EcJumps3[i].dist.data, 32);
-	}
-	err = cudaMemcpy(Kparams.Jumps3, buf, JMP_CNT * 96, cudaMemcpyHostToDevice);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d, cudaMemcpy Jumps3 failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-	free(buf);
-
-	printf("GPU %d: allocated %llu MB, %d kangaroos. OldGpuMode: %s\r\n", CudaIndex, total_mem / (1024 * 1024), KangCnt, IsOldGpu ? "Yes" : "No");
-	return true;
-}
-
-void RCGpuKang::Release()
-{
-	free(RndPnts);
-	free(DPs_out);
-	cudaFree(Kparams.LoopedKangs);
-	cudaFree(Kparams.dbg_buf);
-	cudaFree(Kparams.LoopTable);
-	cudaFree(Kparams.LastPnts);
-	cudaFree(Kparams.L1S2);
-	cudaFree(Kparams.DPTable);
-	cudaFree(Kparams.JumpsList);
-	cudaFree(Kparams.Jumps3);
-	cudaFree(Kparams.Jumps2);
-	cudaFree(Kparams.Jumps1);
-	cudaFree(Kparams.Kangs);
-	cudaFree(Kparams.DPs_out);
-	if (!IsOldGpu)
-		cudaFree(Kparams.L2);
-}
-
-void RCGpuKang::Stop()
-{
-	StopFlag = true;
-}
-
-void RCGpuKang::GenerateRndDistances()
-{
-	for (int i = 0; i < KangCnt; i++)
-	{
-		EcInt d;
-		if (i < KangCnt / 3)
-			d.RndBits(Range - 4); //TAME kangs
-		else
-		{
-			d.RndBits(Range - 1);
-			d.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
-		}
-		memcpy(RndPnts[i].priv, d.data, 24);
-	}
-}
-
-bool RCGpuKang::Start()
-{
-	if (Failed)
-		return false;
-
-	cudaError_t err;
-	err = cudaSetDevice(CudaIndex);
-	if (err != cudaSuccess)
-		return false;
-
-	HalfRange.Set(1);
-	HalfRange.ShiftLeft(Range - 1);
-	PntHalfRange = ec.MultiplyG(HalfRange);
-	NegPntHalfRange = PntHalfRange;
-	NegPntHalfRange.y.NegModP();
-
-	PntA = ec.AddPoints(PntToSolve, NegPntHalfRange);
-	PntB = PntA;
-	PntB.y.NegModP();
-
-	RndPnts = (TPointPriv*)malloc(KangCnt * 96);
-	GenerateRndDistances();
-/* 
-	//we can calc start points on CPU
-	for (int i = 0; i < KangCnt; i++)
-	{
-		EcInt d;
-		memcpy(d.data, RndPnts[i].priv, 24);
-		d.data[3] = 0;
-		d.data[4] = 0;
-		EcPoint p = ec.MultiplyG(d);
-		memcpy(RndPnts[i].x, p.x.data, 32);
-		memcpy(RndPnts[i].y, p.y.data, 32);
-	}
-	for (int i = KangCnt / 3; i < 2 * KangCnt / 3; i++)
-	{
-		EcPoint p;
-		p.LoadFromBuffer64((u8*)RndPnts[i].x);
-		p = ec.AddPoints(p, PntA);
-		p.SaveToBuffer64((u8*)RndPnts[i].x);
-	}
-	for (int i = 2 * KangCnt / 3; i < KangCnt; i++)
-	{
-		EcPoint p;
-		p.LoadFromBuffer64((u8*)RndPnts[i].x);
-		p = ec.AddPoints(p, PntB);
-		p.SaveToBuffer64((u8*)RndPnts[i].x);
-	}
-	//copy to gpu
-	err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-/**/
-	//but it's faster to calc then on GPU
-	u8 buf_PntA[64], buf_PntB[64];
-	PntA.SaveToBuffer64(buf_PntA);
-	PntB.SaveToBuffer64(buf_PntB);
-	for (int i = 0; i < KangCnt; i++)
-	{
-		if (i < KangCnt / 3)
-			memset(RndPnts[i].x, 0, 64);
-		else
-			if (i < 2 * KangCnt / 3)
-				memcpy(RndPnts[i].x, buf_PntA, 64);
-			else
-				memcpy(RndPnts[i].x, buf_PntB, 64);
-	}
-	//copy to gpu
-	err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice);
-	if (err != cudaSuccess)
-	{
-		printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err));
-		return false;
-	}
-	CallGpuKernelGen(Kparams);
-
-	err = cudaMemset(Kparams.L1S2, 0, mpCnt * Kparams.BlockSize * 8);
-	if (err != cudaSuccess)
-		return false;
-	cudaMemset(Kparams.dbg_buf, 0, 1024);
-	cudaMemset(Kparams.LoopTable, 0, KangCnt * MD_LEN * sizeof(u64));
-	return true;
-}
-
-#ifdef DEBUG_MODE
-int RCGpuKang::Dbg_CheckKangs()
-{
-	int kang_size = mpCnt * Kparams.BlockSize * Kparams.GroupCnt * 96;
-	u64* kangs = (u64*)malloc(kang_size);
-	cudaError_t err = cudaMemcpy(kangs, Kparams.Kangs, kang_size, cudaMemcpyDeviceToHost);
-	int res = 0;
-	for (int i = 0; i < KangCnt; i++)
-	{
-		EcPoint Pnt, p;
-		Pnt.LoadFromBuffer64((u8*)&kangs[i * 12 + 0]);
-		EcInt dist;
-		dist.Set(0);
-		memcpy(dist.data, &kangs[i * 12 + 8], 24);
-		bool neg = false;
-		if (dist.data[2] >> 63)
-		{
-			neg = true;
-			memset(((u8*)dist.data) + 24, 0xFF, 16);
-			dist.Neg();
-		}
-		p = ec.MultiplyG_Fast(dist);
-		if (neg)
-			p.y.NegModP();
-		if (i < KangCnt / 3)
-			p = p;
-		else
-			if (i < 2 * KangCnt / 3)
-				p = ec.AddPoints(PntA, p);
-			else
-				p = ec.AddPoints(PntB, p);
-		if (!p.IsEqual(Pnt))
-			res++;
-	}
-	free(kangs);
-	return res;
-}
-#endif
-
-extern u32 gTotalErrors;
-
-//executes in separate thread
-void RCGpuKang::Execute()
-{
-	cudaSetDevice(CudaIndex);
-
-	if (!Start())
-	{
-		gTotalErrors++;
-		return;
-	}
-#ifdef DEBUG_MODE
-	u64 iter = 1;
-#endif
-	cudaError_t err;	
-	while (!StopFlag)
-	{
-		u64 t1 = GetTickCount64();
-		cudaMemset(Kparams.DPs_out, 0, 4);
-		cudaMemset(Kparams.DPTable, 0, KangCnt * sizeof(u32));
-		cudaMemset(Kparams.LoopedKangs, 0, 8);
-		CallGpuKernelABC(Kparams);
-		int cnt;
-		err = cudaMemcpy(&cnt, Kparams.DPs_out, 4, cudaMemcpyDeviceToHost);
-		if (err != cudaSuccess)
-		{
-			printf("GPU %d, CallGpuKernel failed: %s\r\n", CudaIndex, cudaGetErrorString(err));
-			gTotalErrors++;
-			break;
-		}
-		
-		if (cnt >= MAX_DP_CNT)
-		{
-			cnt = MAX_DP_CNT;
-			printf("GPU %d, gpu DP buffer overflow, some points lost, increase DP value!\r\n", CudaIndex);
-		}
-		u64 pnt_cnt = (u64)KangCnt * STEP_CNT;
-
-		if (cnt)
-		{
-			err = cudaMemcpy(DPs_out, Kparams.DPs_out + 4, cnt * GPU_DP_SIZE, cudaMemcpyDeviceToHost);
-			if (err != cudaSuccess)
-			{
-				gTotalErrors++;
-				break;
-			}
-			AddPointsToList(DPs_out, cnt, (u64)KangCnt * STEP_CNT);
-		}
-
-		//dbg
-		cudaMemcpy(dbg, Kparams.dbg_buf, 1024, cudaMemcpyDeviceToHost);
-
-		u32 lcnt;
-		cudaMemcpy(&lcnt, Kparams.LoopedKangs, 4, cudaMemcpyDeviceToHost);
-		//printf("GPU %d, Looped: %d\r\n", CudaIndex, lcnt);
-
-		u64 t2 = GetTickCount64();
-		u64 tm = t2 - t1;
-		if (!tm)
-			tm = 1;
-		int cur_speed = (int)(pnt_cnt / (tm * 1000));
-		//printf("GPU %d kernel time %d ms, speed %d MH\r\n", CudaIndex, (int)tm, cur_speed);
-
-		SpeedStats[cur_stats_ind] = cur_speed;
-		cur_stats_ind = (cur_stats_ind + 1) % STATS_WND_SIZE;
-
-#ifdef DEBUG_MODE
-		if ((iter % 300) == 0)
-		{
-			int corr_cnt = Dbg_CheckKangs();
-			if (corr_cnt)
-			{
-				printf("DBG: GPU %d, KANGS CORRUPTED: %d\r\n", CudaIndex, corr_cnt);
-				gTotalErrors++;
-			}
-			else
-				printf("DBG: GPU %d, ALL KANGS OK!\r\n", CudaIndex);
-		}
-		iter++;
-#endif
-	}
-
-	Release();
-}
-
-int RCGpuKang::GetStatsSpeed()
-{
-	int res = SpeedStats[0];
-	for (int i = 1; i < STATS_WND_SIZE; i++)
-		res += SpeedStats[i];
-	return res / STATS_WND_SIZE;
+// This file is a part of RCKangaroo software
+// (c) 2024, RetiredCoder (RC)
+// License: GPLv3, see "LICENSE.TXT" file
+// https://github.com/RetiredC
+
+
+#include <iostream>
+#include "cuda_runtime.h"
+#include "cuda.h"
+
+#include "GpuKang.h"
+extern int gTameRatioPct;
+extern int gTameBitsOffset;
+
+cudaError_t cuSetGpuParams(TKparams Kparams, u64* _jmp2_table);
+void CallGpuKernelGen(TKparams Kparams);
+void CallGpuKernelABC(TKparams Kparams);
+void AddPointsToList(u32* data, int cnt, u64 ops_cnt);
+extern bool gGenMode; //tames generation mode
+
+int RCGpuKang::CalcKangCnt()
+{
+	Kparams.BlockCnt = mpCnt;
+	Kparams.BlockSize = IsOldGpu ? 512 : 256;
+	Kparams.GroupCnt = IsOldGpu ? 64 : 24;
+	return Kparams.BlockSize* Kparams.GroupCnt* Kparams.BlockCnt;
+}
+
+//executes in main thread
+bool RCGpuKang::Prepare(EcPoint _PntToSolve, int _Range, int _DP, EcJMP* _EcJumps1, EcJMP* _EcJumps2, EcJMP* _EcJumps3)
+{
+	PntToSolve = _PntToSolve;
+	Range = _Range;
+	DP = _DP;
+	EcJumps1 = _EcJumps1;
+	EcJumps2 = _EcJumps2;
+	EcJumps3 = _EcJumps3;
+	StopFlag = false;
+	Failed = false;
+	u64 total_mem = 0;
+	memset(dbg, 0, sizeof(dbg));
+	memset(SpeedStats, 0, sizeof(SpeedStats));
+	cur_stats_ind = 0;
+
+	cudaError_t err;
+	err = cudaSetDevice(CudaIndex);
+	if (err != cudaSuccess)
+		return false;
+
+	Kparams.BlockCnt = mpCnt;
+	Kparams.BlockSize = IsOldGpu ? 512 : 256;
+	Kparams.GroupCnt = IsOldGpu ? 64 : 24;
+	KangCnt = Kparams.BlockSize * Kparams.GroupCnt * Kparams.BlockCnt;
+	Kparams.KangCnt = KangCnt;
+	Kparams.DP = DP;
+	Kparams.KernelA_LDS_Size = 64 * JMP_CNT + 16 * Kparams.BlockSize;
+	Kparams.KernelB_LDS_Size = 64 * JMP_CNT;
+	Kparams.KernelC_LDS_Size = 96 * JMP_CNT;
+	Kparams.IsGenMode = gGenMode;
+
+//allocate gpu mem
+	u64 size;
+	if (!IsOldGpu)
+	{
+		//L2	
+		int L2size = Kparams.KangCnt * (3 * 32);
+		total_mem += L2size;
+		err = cudaMalloc((void**)&Kparams.L2, L2size);
+		if (err != cudaSuccess)
+		{
+			printf("GPU %d, Allocate L2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+			return false;
+		}
+		size = L2size;
+		if (size > persistingL2CacheMaxSize)
+			size = persistingL2CacheMaxSize;
+		err = cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, size); // set max allowed size for L2
+		//persisting for L2
+		cudaStreamAttrValue stream_attribute;                                                   
+		stream_attribute.accessPolicyWindow.base_ptr = Kparams.L2;
+		stream_attribute.accessPolicyWindow.num_bytes = size;										
+		stream_attribute.accessPolicyWindow.hitRatio = 1.0;                                     
+		stream_attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;             
+		stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;  	
+		err = cudaStreamSetAttribute(NULL, cudaStreamAttributeAccessPolicyWindow, &stream_attribute);
+		if (err != cudaSuccess)
+		{
+			printf("GPU %d, cudaStreamSetAttribute failed: %s\n", CudaIndex, cudaGetErrorString(err));
+			return false;
+		}
+	}
+	size = MAX_DP_CNT * GPU_DP_SIZE + 16;
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.DPs_out, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate GpuOut memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = KangCnt * 96;
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.Kangs, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate pKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	total_mem += JMP_CNT * 96;
+	err = cudaMalloc((void**)&Kparams.Jumps1, JMP_CNT * 96);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	total_mem += JMP_CNT * 96;
+	err = cudaMalloc((void**)&Kparams.Jumps2, JMP_CNT * 96);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate Jumps1 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	total_mem += JMP_CNT * 96;
+	err = cudaMalloc((void**)&Kparams.Jumps3, JMP_CNT * 96);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate Jumps3 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = 2 * (u64)KangCnt * STEP_CNT;
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.JumpsList, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate JumpsList memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = (u64)KangCnt * (16 * DPTABLE_MAX_CNT + sizeof(u32)); //we store 16bytes of X
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.DPTable, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate DPTable memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = mpCnt * Kparams.BlockSize * sizeof(u64);
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.L1S2, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate L1S2 memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = (u64)KangCnt * MD_LEN * (2 * 32);
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.LastPnts, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = (u64)KangCnt * MD_LEN * sizeof(u64);
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.LoopTable, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate LastPnts memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	total_mem += 1024;
+	err = cudaMalloc((void**)&Kparams.dbg_buf, 1024);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate dbg_buf memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	size = sizeof(u32) * KangCnt + 8;
+	total_mem += size;
+	err = cudaMalloc((void**)&Kparams.LoopedKangs, size);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d Allocate LoopedKangs memory failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+
+	DPs_out = (u32*)malloc(MAX_DP_CNT * GPU_DP_SIZE);
+
+//jmp1
+	u64* buf = (u64*)malloc(JMP_CNT * 96);
+	for (int i = 0; i < JMP_CNT; i++)
+	{
+		memcpy(buf + i * 12, EcJumps1[i].p.x.data, 32);
+		memcpy(buf + i * 12 + 4, EcJumps1[i].p.y.data, 32);
+		memcpy(buf + i * 12 + 8, EcJumps1[i].dist.data, 32);
+	}
+	err = cudaMemcpy(Kparams.Jumps1, buf, JMP_CNT * 96, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d, cudaMemcpy Jumps1 failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+	free(buf);
+//jmp2
+	buf = (u64*)malloc(JMP_CNT * 96);
+	u64* jmp2_table = (u64*)malloc(JMP_CNT * 64);
+	for (int i = 0; i < JMP_CNT; i++)
+	{
+		memcpy(buf + i * 12, EcJumps2[i].p.x.data, 32);
+		memcpy(jmp2_table + i * 8, EcJumps2[i].p.x.data, 32);
+		memcpy(buf + i * 12 + 4, EcJumps2[i].p.y.data, 32);
+		memcpy(jmp2_table + i * 8 + 4, EcJumps2[i].p.y.data, 32);
+		memcpy(buf + i * 12 + 8, EcJumps2[i].dist.data, 32);
+	}
+	err = cudaMemcpy(Kparams.Jumps2, buf, JMP_CNT * 96, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d, cudaMemcpy Jumps2 failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+	free(buf);
+
+	err = cuSetGpuParams(Kparams, jmp2_table);
+	if (err != cudaSuccess)
+	{
+		free(jmp2_table);
+		printf("GPU %d, cuSetGpuParams failed: %s!\r\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+	free(jmp2_table);
+//jmp3
+	buf = (u64*)malloc(JMP_CNT * 96);
+	for (int i = 0; i < JMP_CNT; i++)
+	{
+		memcpy(buf + i * 12, EcJumps3[i].p.x.data, 32);
+		memcpy(buf + i * 12 + 4, EcJumps3[i].p.y.data, 32);
+		memcpy(buf + i * 12 + 8, EcJumps3[i].dist.data, 32);
+	}
+	err = cudaMemcpy(Kparams.Jumps3, buf, JMP_CNT * 96, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d, cudaMemcpy Jumps3 failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+	free(buf);
+
+	printf("GPU %d: allocated %llu MB, %d kangaroos. OldGpuMode: %s\r\n", CudaIndex, total_mem / (1024 * 1024), KangCnt, IsOldGpu ? "Yes" : "No");
+	return true;
+}
+
+void RCGpuKang::Release()
+{
+	free(RndPnts);
+	free(DPs_out);
+	cudaFree(Kparams.LoopedKangs);
+	cudaFree(Kparams.dbg_buf);
+	cudaFree(Kparams.LoopTable);
+	cudaFree(Kparams.LastPnts);
+	cudaFree(Kparams.L1S2);
+	cudaFree(Kparams.DPTable);
+	cudaFree(Kparams.JumpsList);
+	cudaFree(Kparams.Jumps3);
+	cudaFree(Kparams.Jumps2);
+	cudaFree(Kparams.Jumps1);
+	cudaFree(Kparams.Kangs);
+	cudaFree(Kparams.DPs_out);
+	if (!IsOldGpu)
+		cudaFree(Kparams.L2);
+}
+
+void RCGpuKang::Stop()
+{
+	StopFlag = true;
+}
+
+void RCGpuKang::GenerateRndDistances()
+{
+	for (int i = 0; i < KangCnt; i++)
+	{
+		EcInt d;
+		int tameBorder = (KangCnt * gTameRatioPct) / 100;
+		int tameBits = Range - gTameBitsOffset;
+		if (tameBits < 1) tameBits = 1;
+		if (i < tameBorder)
+			d.RndBits(tameBits); // TAME kangs
+		else
+		{
+			d.RndBits(Range - 1);
+			d.data[0] &= 0xFFFFFFFFFFFFFFFE; // must be even
+		}
+		memcpy(RndPnts[i].priv, d.data, 24);
+	}
+}
+
+bool RCGpuKang::Start()
+{
+	if (Failed)
+		return false;
+
+	cudaError_t err;
+	err = cudaSetDevice(CudaIndex);
+	if (err != cudaSuccess)
+		return false;
+
+	HalfRange.Set(1);
+	HalfRange.ShiftLeft(Range - 1);
+	PntHalfRange = ec.MultiplyG(HalfRange);
+	NegPntHalfRange = PntHalfRange;
+	NegPntHalfRange.y.NegModP();
+
+	PntA = ec.AddPoints(PntToSolve, NegPntHalfRange);
+	PntB = PntA;
+	PntB.y.NegModP();
+
+	RndPnts = (TPointPriv*)malloc(KangCnt * 96);
+	GenerateRndDistances();
+/* 
+	//we can calc start points on CPU
+	for (int i = 0; i < KangCnt; i++)
+	{
+		EcInt d;
+		memcpy(d.data, RndPnts[i].priv, 24);
+		d.data[3] = 0;
+		d.data[4] = 0;
+		EcPoint p = ec.MultiplyG(d);
+		memcpy(RndPnts[i].x, p.x.data, 32);
+		memcpy(RndPnts[i].y, p.y.data, 32);
+	}
+	for (int i = KangCnt / 3; i < 2 * KangCnt / 3; i++)
+	{
+		EcPoint p;
+		p.LoadFromBuffer64((u8*)RndPnts[i].x);
+		p = ec.AddPoints(p, PntA);
+		p.SaveToBuffer64((u8*)RndPnts[i].x);
+	}
+	for (int i = 2 * KangCnt / 3; i < KangCnt; i++)
+	{
+		EcPoint p;
+		p.LoadFromBuffer64((u8*)RndPnts[i].x);
+		p = ec.AddPoints(p, PntB);
+		p.SaveToBuffer64((u8*)RndPnts[i].x);
+	}
+	//copy to gpu
+	err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+/**/
+	//but it's faster to calc then on GPU
+	u8 buf_PntA[64], buf_PntB[64];
+	PntA.SaveToBuffer64(buf_PntA);
+	PntB.SaveToBuffer64(buf_PntB);
+	for (int i = 0; i < KangCnt; i++)
+	{
+		if (i < KangCnt / 3)
+			memset(RndPnts[i].x, 0, 64);
+		else
+			if (i < 2 * KangCnt / 3)
+				memcpy(RndPnts[i].x, buf_PntA, 64);
+			else
+				memcpy(RndPnts[i].x, buf_PntB, 64);
+	}
+	//copy to gpu
+	err = cudaMemcpy(Kparams.Kangs, RndPnts, KangCnt * 96, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess)
+	{
+		printf("GPU %d, cudaMemcpy failed: %s\n", CudaIndex, cudaGetErrorString(err));
+		return false;
+	}
+	CallGpuKernelGen(Kparams);
+
+	err = cudaMemset(Kparams.L1S2, 0, mpCnt * Kparams.BlockSize * 8);
+	if (err != cudaSuccess)
+		return false;
+	cudaMemset(Kparams.dbg_buf, 0, 1024);
+	cudaMemset(Kparams.LoopTable, 0, KangCnt * MD_LEN * sizeof(u64));
+	return true;
+}
+
+#ifdef DEBUG_MODE
+int RCGpuKang::Dbg_CheckKangs()
+{
+	int kang_size = mpCnt * Kparams.BlockSize * Kparams.GroupCnt * 96;
+	u64* kangs = (u64*)malloc(kang_size);
+	cudaError_t err = cudaMemcpy(kangs, Kparams.Kangs, kang_size, cudaMemcpyDeviceToHost);
+	int res = 0;
+	for (int i = 0; i < KangCnt; i++)
+	{
+		EcPoint Pnt, p;
+		Pnt.LoadFromBuffer64((u8*)&kangs[i * 12 + 0]);
+		EcInt dist;
+		dist.Set(0);
+		memcpy(dist.data, &kangs[i * 12 + 8], 24);
+		bool neg = false;
+		if (dist.data[2] >> 63)
+		{
+			neg = true;
+			memset(((u8*)dist.data) + 24, 0xFF, 16);
+			dist.Neg();
+		}
+		p = ec.MultiplyG_Fast(dist);
+		if (neg)
+			p.y.NegModP();
+		if (i < KangCnt / 3)
+			p = p;
+		else
+			if (i < 2 * KangCnt / 3)
+				p = ec.AddPoints(PntA, p);
+			else
+				p = ec.AddPoints(PntB, p);
+		if (!p.IsEqual(Pnt))
+			res++;
+	}
+	free(kangs);
+	return res;
+}
+#endif
+
+extern u32 gTotalErrors;
+
+//executes in separate thread
+void RCGpuKang::Execute()
+{
+	cudaSetDevice(CudaIndex);
+
+	if (!Start())
+	{
+		gTotalErrors++;
+		return;
+	}
+#ifdef DEBUG_MODE
+	u64 iter = 1;
+#endif
+	cudaError_t err;	
+	while (!StopFlag)
+	{
+		u64 t1 = GetTickCount64();
+		cudaMemset(Kparams.DPs_out, 0, 4);
+		cudaMemset(Kparams.DPTable, 0, KangCnt * sizeof(u32));
+		cudaMemset(Kparams.LoopedKangs, 0, 8);
+		CallGpuKernelABC(Kparams);
+		int cnt;
+		err = cudaMemcpy(&cnt, Kparams.DPs_out, 4, cudaMemcpyDeviceToHost);
+		if (err != cudaSuccess)
+		{
+			printf("GPU %d, CallGpuKernel failed: %s\r\n", CudaIndex, cudaGetErrorString(err));
+			gTotalErrors++;
+			break;
+		}
+		
+		if (cnt >= MAX_DP_CNT)
+		{
+			cnt = MAX_DP_CNT;
+			printf("GPU %d, gpu DP buffer overflow, some points lost, increase DP value!\r\n", CudaIndex);
+		}
+		u64 pnt_cnt = (u64)KangCnt * STEP_CNT;
+
+		if (cnt)
+		{
+			err = cudaMemcpy(DPs_out, Kparams.DPs_out + 4, cnt * GPU_DP_SIZE, cudaMemcpyDeviceToHost);
+			if (err != cudaSuccess)
+			{
+				gTotalErrors++;
+				break;
+			}
+			AddPointsToList(DPs_out, cnt, (u64)KangCnt * STEP_CNT);
+		}
+
+		//dbg
+		cudaMemcpy(dbg, Kparams.dbg_buf, 1024, cudaMemcpyDeviceToHost);
+
+		u32 lcnt;
+		cudaMemcpy(&lcnt, Kparams.LoopedKangs, 4, cudaMemcpyDeviceToHost);
+		//printf("GPU %d, Looped: %d\r\n", CudaIndex, lcnt);
+
+		u64 t2 = GetTickCount64();
+		u64 tm = t2 - t1;
+		if (!tm)
+			tm = 1;
+		int cur_speed = (int)(pnt_cnt / (tm * 1000));
+		//printf("GPU %d kernel time %d ms, speed %d MH\r\n", CudaIndex, (int)tm, cur_speed);
+
+		SpeedStats[cur_stats_ind] = cur_speed;
+		cur_stats_ind = (cur_stats_ind + 1) % STATS_WND_SIZE;
+
+#ifdef DEBUG_MODE
+		if ((iter % 300) == 0)
+		{
+			int corr_cnt = Dbg_CheckKangs();
+			if (corr_cnt)
+			{
+				printf("DBG: GPU %d, KANGS CORRUPTED: %d\r\n", CudaIndex, corr_cnt);
+				gTotalErrors++;
+			}
+			else
+				printf("DBG: GPU %d, ALL KANGS OK!\r\n", CudaIndex);
+		}
+		iter++;
+#endif
+	}
+
+	Release();
+}
+
+int RCGpuKang::GetStatsSpeed()
+{
+	int res = SpeedStats[0];
+	for (int i = 1; i < STATS_WND_SIZE; i++)
+		res += SpeedStats[i];
+	return res / STATS_WND_SIZE;
 }
\ No newline at end of file
diff --git a/LICENSE.TXT b/LICENSE.TXT
deleted file mode 100644
index 94a9ed0..0000000
--- a/LICENSE.TXT
+++ /dev/null
@@ -1,674 +0,0 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Use with the GNU Affero General Public License.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/Makefile b/Makefile
index 8aa0b60..8cf0beb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,29 +1,85 @@
-CC := g++
-NVCC := /usr/local/cuda-12.0/bin/nvcc
-CUDA_PATH ?= /usr/local/cuda-12.0
-
-CCFLAGS := -O3 -I$(CUDA_PATH)/include
-NVCCFLAGS := -O3 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_75,code=compute_75 -gencode=arch=compute_61,code=compute_61
-LDFLAGS := -L$(CUDA_PATH)/lib64 -lcudart -pthread
-
-CPU_SRC := RCKangaroo.cpp GpuKang.cpp Ec.cpp utils.cpp
-GPU_SRC := RCGpuCore.cu
-
-CPP_OBJECTS := $(CPU_SRC:.cpp=.o)
-CU_OBJECTS := $(GPU_SRC:.cu=.o)
-
-TARGET := rckangaroo
-
-all: $(TARGET)
-
-$(TARGET): $(CPP_OBJECTS) $(CU_OBJECTS)
-	$(CC) $(CCFLAGS) -o $@ $^ $(LDFLAGS)
-
-%.o: %.cpp
-	$(CC) $(CCFLAGS) -c $< -o $@
-
-%.o: %.cu
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
-
-clean:
-	rm -f $(CPP_OBJECTS) $(CU_OBJECTS)
+# RCKangaroo Makefile (robusto y autodetecta CUDA)
+# Uso:
+#   make clean
+#   make SM=86 USE_JACOBIAN=1 PROFILE=release -j
+#   ./rckangaroo -h
+
+TARGET := rckangaroo
+
+# Toolchains
+CC    := g++
+NVCC  := /usr/bin/nvcc
+
+# CUDA
+CUDA_PATH ?= /usr/local/cuda-12.0
+SM        ?= 86
+USE_JACOBIAN ?= 1
+PROFILE   ?= release
+
+# Optimización separada: host vs device
+HOST_COPT_release := -O3 -DNDEBUG -ffunction-sections -fdata-sections
+HOST_COPT_debug   := -O0 -g
+HOST_COPT := $(HOST_COPT_$(PROFILE))
+
+DEV_COPT_release := -O3
+DEV_COPT_debug   := -O0 -g
+DEV_COPT := $(DEV_COPT_$(PROFILE))
+
+# Flags
+CCFLAGS    := -std=c++17 -I$(CUDA_PATH)/include $(HOST_COPT) -DUSE_JACOBIAN=$(USE_JACOBIAN)
+NVCCFLAGS  := -std=c++17 -arch=sm_$(SM) $(DEV_COPT) -Xptxas -O3 -Xptxas -dlcm=ca -Xfatbin=-compress-all -DUSE_JACOBIAN=$(USE_JACOBIAN)
+NVCCXCOMP  := -Xcompiler -ffunction-sections -Xcompiler -fdata-sections
+
+LDFLAGS   := -L$(CUDA_PATH)/lib64 -lcudart -pthread
+
+# Fuentes
+SRC_CPP := RCKangaroo.cpp GpuKang.cpp Ec.cpp utils.cpp
+
+# Directorio donde está el .cu (por defecto, raíz)
+CU_DIR ?= .
+SRC_CU := $(wildcard $(CU_DIR)/RCGpuCore.cu)
+
+OBJ_CPP := $(SRC_CPP:.cpp=.o)
+OBJ_CU  := $(patsubst %.cu,%.o,$(SRC_CU))
+
+ifeq ($(strip $(OBJ_CU)),)
+  $(warning [Makefile] No se encontró RCGpuCore.cu en $(CU_DIR). Se construirá solo CPU.)
+  OBJS := $(OBJ_CPP)
+else
+  OBJS := $(OBJ_CPP) $(OBJ_CU)
+endif
+
+.PHONY: all clean print-vars
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS)
+	$(CC) $(CCFLAGS) -o $@ $(OBJS) $(LDFLAGS)
+
+%.o: %.cpp
+	$(CC) $(CCFLAGS) -c $< -o $@
+
+# Regla genérica CUDA (.cu -> .o) con flags host vía -Xcompiler
+$(CU_DIR)/%.o: $(CU_DIR)/%.cu
+	$(NVCC) $(NVCCFLAGS) $(NVCCXCOMP) -c $< -o $@
+
+# Regla explícita (por si tu make ignora patrones)
+$(CU_DIR)/RCGpuCore.o: $(CU_DIR)/RCGpuCore.cu RCGpuUtils.h Ec.h defs.h
+	$(NVCC) $(NVCCFLAGS) $(NVCCXCOMP) -c $< -o $@
+
+clean:
+	rm -f $(OBJ_CPP) $(OBJ_CU) $(TARGET)
+
+print-vars:
+	@echo "CUDA_PATH=$(CUDA_PATH)"
+	@echo "SM=$(SM)"
+	@echo "USE_JACOBIAN=$(USE_JACOBIAN)"
+	@echo "PROFILE=$(PROFILE)"
+	@echo "SRC_CPP=$(SRC_CPP)"
+	@echo "CU_DIR=$(CU_DIR)"
+	@echo "SRC_CU=$(SRC_CU)"
+	@echo "OBJ_CPP=$(OBJ_CPP)"
+	@echo "OBJ_CU=$(OBJ_CU)"
+	@echo "OBJS=$(OBJS)"
+	@echo "NVCCFLAGS=$(NVCCFLAGS)"
+	@echo "NVCCXCOMP=$(NVCCXCOMP)"
diff --git a/RCGpuCore.cu b/RCGpuCore.cu
index 4556a5c..c96a981 100644
--- a/RCGpuCore.cu
+++ b/RCGpuCore.cu
@@ -28,6 +28,11 @@ extern __shared__ u64 LDS[];
 
 //this kernel performs main jumps
 extern "C" __launch_bounds__(BLOCK_SIZE, 1)
+
+// === Jacobian option (experimental) =========================================
+// Para habilitar: compilar con -DUSE_JACOBIAN=1. Mantiene el formato de salida.
+// Implementa sumas J+A y conversión por lotes sólo para la verificación DP.
+// ============================================================================
 __global__ void KernelA(const TKparams Kparams)
 {
 	u64* L2x = Kparams.L2 + 2 * THREAD_X + 4 * BLOCK_SIZE * BLOCK_X;
@@ -79,6 +84,73 @@ __global__ void KernelA(const TKparams Kparams)
 
     for (int step_ind = 0; step_ind < STEP_CNT; step_ind++)
     {
+#if USE_JACOBIAN
+        // [Jacobian path] — usamos L2x=L2X, L2y=L2Y, L2s=L2Z
+        __align__(16) u64 X[4], Y[4], Z[4];
+        // 1) Cargar Z=1 si es la primera iteración
+        for (int group = 0; group < PNT_GROUP_CNT; group++) {
+            LOAD_VAL_256(X, L2x, group);
+            LOAD_VAL_256(Y, L2y, group);
+            if (step_ind==0) { Z[0]=1; Z[1]=Z[2]=Z[3]=0; }
+            else { LOAD_VAL_256(Z, L2s, group); }
+            // salto
+            u64* jmp_table;
+            __align__(16) u64 jmp_x[4], jmp_y[4];
+            u16 jmp_ind_loc = X[0] % JMP_CNT;
+            jmp_table = ((L1S2 >> group) & 1) ? jmp2_table : jmp1_table;
+            Copy_int4_x2(jmp_x, jmp_table + 8 * jmp_ind_loc);
+            Copy_int4_x2(jmp_y, jmp_table + 8 * jmp_ind_loc + 4);
+            // manejo de inversión de y según bit
+            u32 inv_flag = (u32)Y[0] & 1u;
+            if (inv_flag) { jmp_ind_loc |= INV_FLAG; NegModP(jmp_y); }
+            u64 X3[4],Y3[4],Z3[4];
+            JacobianAddMixed(X3,Y3,Z3,X,Y,Z,jmp_x,jmp_y);
+            SAVE_VAL_256(L2x, X3, group);
+            SAVE_VAL_256(L2y, Y3, group);
+            SAVE_VAL_256(L2s, Z3, group);
+
+            // Conversión batched a afín sólo para DP (una inversión total por hilo)
+        }
+        // Batch inversion: Z^-1 para todos los grupos (Montgomery trick)
+        __align__(16) u64 prod[4] = {1,0,0,0};
+        for (int group=0; group<PNT_GROUP_CNT; group++) {
+            LOAD_VAL_256(Z, L2s, group);
+            MulModP(prod, prod, Z);
+            SAVE_VAL_256(L2s, prod, group); // prefijos
+        }
+        InvModP((u32*)prod); // prod = (∏Z)^-1
+        for (int group=PNT_GROUP_CNT-1; group>=0; group--) {
+            __align__(16) u64 prev[4] = {1,0,0,0};
+            if (group>0) LOAD_VAL_256(prev, L2s, group-1);
+            LOAD_VAL_256(Z, L2s, group);
+            __align__(16) u64 invZ[4]; MulModP(invZ, prod, prev); // Z_i^-1
+            // actualizar prod = prod * Z_i
+            MulModP(prod, prod, Z);
+            // x_aff = X * invZ^2
+            __align__(16) u64 Xg[4], invZ2[4];
+            LOAD_VAL_256(Xg, L2x, group);
+            MulModP(invZ2, invZ, invZ);
+            MulModP(Xg, Xg, invZ2);
+            // check DP
+            if ((Xg[3] & dp_mask64) == 0) {
+                u32 kang_ind = (THREAD_X + BLOCK_X * BLOCK_SIZE) * PNT_GROUP_CNT + group;
+                u32 ind = atomicAdd(Kparams.DPTable + kang_ind, 1);
+                ind = min(ind, DPTABLE_MAX_CNT - 1);
+                int4 *pdst = (int4*)(&Kparams.DPs_out[(u64)kang_ind * DPTABLE_MAX_CNT * 8 + 8 * ind]);
+                ((int4*)pdst)[0] = ((int4*)Xg)[0];
+                ((int4*)pdst)[1] = ((int4*)Xg)[1];
+                // acción de salto (igual al camino original)
+                // NOTA: reusamos jmp_ind_loc de arriba no disponible aquí; reconstituimos:
+                LOAD_VAL_256(X, L2x, group);
+                u16 ji = X[0] % JMP_CNT;
+                ji |= ((u32)Y[0] & 1u) ? 0 : INV_FLAG;
+                ((u16*)pdst)[8+0] = ji; // almacena índice salto+flags
+                ((u16*)pdst)[8+1] = 0;
+            }
+        }
+        continue; // saltamos el camino afín original
+#endif // USE_JACOBIAN
+
         __align__(16) u64 inverse[5];
 		u64* jmp_table;
 		__align__(16) u64 jmp_x[4];
@@ -532,12 +604,61 @@ __device__ __forceinline__ bool ProcessJumpDistance(u32 step_ind, u32 d_cur, u64
 	table[iter] = d[0];
 	*cur_ind = (iter + 1) % MD_LEN;
 
-	if (found_ind < 0)
-	{		
-		if (d_cur & DP_FLAG)
-			BuildDP(Kparams, kang_ind, d);
-		return false;
-	}
+	// --- Warp-aggregated emisión de DPs (reemplaza BuildDP) ---
+if (found_ind < 0)
+{
+    // 1) Cada hilo decide si «quiere emitir» un DP en este paso
+    //    (antes llamábamos a BuildDP)
+    bool emit = false;
+    int4 rx_local;  // X parcial del DP (lo que lectura BuildDP hacía)
+    if (d_cur & DP_FLAG) {
+        // Replicamos la parte de BuildDP que obtiene el X guardado en DPTable
+        int idx = atomicAdd(Kparams.DPTable + kang_ind, 0x10000);
+        idx >>= 16;  // índice de lectura (alto de 16 bits)
+        if (idx < DPTABLE_MAX_CNT) {
+            rx_local = *(int4*)(Kparams.DPTable
+                       + Kparams.KangCnt
+                       + (kang_ind * DPTABLE_MAX_CNT + idx) * 4);
+            emit = true;
+        }
+    }
+
+    // 2) Warp-aggregated atomic: una sola atomicAdd por warp
+    const unsigned active = __activemask();
+    const int lane = threadIdx.x & 31;
+    const int leader = __ffs(active) - 1;              // primer lane activo
+    unsigned mask_emit = __ballot_sync(active, emit);  // quiénes emiten en el warp
+    int hits = __popc(mask_emit);                      // cuántos DPs emiten
+
+    if (hits) {
+        // Sólo el líder reserva 'hits' slots contiguos en la salida
+        unsigned base = 0;
+        if (lane == leader) {
+            base = atomicAdd(Kparams.DPs_out, (unsigned)hits);
+        }
+        // Broadcast del 'base' al warp
+        base = __shfl_sync(active, base, leader);
+
+        if (emit) {
+            // Posición compacta de este hilo dentro del bloque reservado
+            unsigned laneMask = mask_emit & ((1u << lane) - 1u);
+            unsigned pos = __popc(laneMask);
+
+            unsigned outIdx = base + pos;
+            outIdx = min(outIdx, (unsigned)(MAX_DP_CNT - 1));
+
+            // Escribir el registro exactamente igual que BuildDP
+            u32* DPs = Kparams.DPs_out + 4 + outIdx * (GPU_DP_SIZE / 4);
+            *(int4*)&DPs[0] = rx_local;              // X parcial (int4)
+            *(int4*)&DPs[4] = ((int4*)d)[0];         // dist[0..1] (128 bits)
+            *(u64*)&DPs[8] = d[2];                   // dist[2]   (64  bits)
+            DPs[10] = 3 * kang_ind / Kparams.KangCnt; // tipo de kanguro
+        }
+    }
+
+    return false;
+}
+
 
 	u32 LoopSize = (iter + MD_LEN - found_ind) % MD_LEN;
 	if (!LoopSize)
diff --git a/RCGpuUtils.h b/RCGpuUtils.h
index 6c5db83..8bf4e2e 100644
--- a/RCGpuUtils.h
+++ b/RCGpuUtils.h
@@ -519,7 +519,7 @@ __device__ __forceinline__ void InvModP(u32* res)
 	__align__(8) u32 modp[9];
 	__align__(8) u32 val[9];
 	__align__(8) u32 a[9];
-	__align__(8) u32 tmp[4][9+1]; //+1 because we need alignment 64bit for tmp[>0]
+	__align__(8) u32 tmp[4][9];
 
 	((u64*)modp)[0] = P_0;
 	((u64*)modp)[1] = P_123;
@@ -628,3 +628,53 @@ __device__ __forceinline__ void InvModP(u32* res)
 		sub_288_P(res);
 }
 
+// === Extensiones: utilidades Jacobianas (GPU) ================================
+__device__ __forceinline__ void SquareModP(u64 *res, u64 *a) {
+	MulModP(res, a, a);
+}
+
+// r = 2*p  (doblado jacobiano, a=0)
+__device__ __forceinline__ void JacobianDouble(u64 *X3,u64 *Y3,u64 *Z3,const u64 *X1,const u64 *Y1,const u64 *Z1){
+	u64 XX[4],YY[4],YYYY[4],S[4],M[4],T[4],twoS[4],Z3tmp[4];
+	Copy_u64_x4(XX,(void*)X1); MulModP(XX,XX,XX);
+	Copy_u64_x4(YY,(void*)Y1); MulModP(YY,YY,YY);
+	Copy_u64_x4(YYYY,YY);      MulModP(YYYY,YYYY,YY);
+	Copy_u64_x4(S,(void*)X1);  MulModP(S,S,YY); AddModP(S,S,S); // 2*X1*Y1^2
+	Copy_u64_x4(M,XX); AddModP(M,M,XX); AddModP(M,M,XX);        // 3*X1^2
+	Copy_u64_x4(T,M);  MulModP(T,T,M);                           // M^2
+	Copy_u64_x4(twoS,S); AddModP(twoS,twoS,S);                   // 2*S
+	SubModP((u64*)X3,T,twoS);                                    // X3
+	u64 V[4]; Copy_u64_x4(V,S); SubModP(V,V,(u64*)X3); MulModP(V,V,M);
+	u64 eightYYYY[4]; Copy_u64_x4(eightYYYY,YYYY);
+	for (int i=0;i<3;i++) AddModP(eightYYYY,eightYYYY,YYYY); // *8
+	SubModP((u64*)Y3,V,eightYYYY);
+	Copy_u64_x4(Z3tmp,(void*)Y1); AddModP(Z3tmp,Z3tmp,(u64*)Y1); MulModP(Z3tmp,Z3tmp,(u64*)Z1);
+	Copy_u64_x4((u64*)Z3,Z3tmp);
+}
+
+// r = p + q_aff  (mixta)
+__device__ __forceinline__ void JacobianAddMixed(u64 *X3,u64 *Y3,u64 *Z3,const u64 *X1,const u64 *Y1,const u64 *Z1,const u64 *Qx,const u64 *Qy){
+	u64 Z2[4],Z3_[4],U2[4],S2[4],H[4],R[4];
+	Copy_u64_x4(Z2,(void*)Z1); MulModP(Z2,Z2,Z2);
+	Copy_u64_x4(Z3_,Z2);       MulModP(Z3_,Z3_,(u64*)Z1);
+	Copy_u64_x4(U2,(void*)Qx); MulModP(U2,U2,Z2);
+	Copy_u64_x4(S2,(void*)Qy); MulModP(S2,S2,Z3_);
+	Copy_u64_x4(H,U2); SubModP(H,H,(u64*)X1);
+	Copy_u64_x4(R,S2); SubModP(R,R,(u64*)Y1);
+	// if H==0 -> degenerate (omit; improbable en saltos aleatorios)
+	u64 HH[4],HHH[4],V[4],X3_[4],Y3_[4],Z3tmp[4];
+	Copy_u64_x4(HH,H);  MulModP(HH,HH,H);
+	Copy_u64_x4(HHH,HH);MulModP(HHH,HHH,H);
+	Copy_u64_x4(V,(void*)X1);  MulModP(V,V,HH);
+	Copy_u64_x4(X3_,R); MulModP(X3_,X3_,R);
+	SubModP(X3_,X3_,HHH);
+	u64 twoV[4]; Copy_u64_x4(twoV,V); AddModP(twoV,twoV,V);
+	SubModP((u64*)X3,X3_,twoV);
+	Copy_u64_x4(Y3_,V); SubModP(Y3_,Y3_,(u64*)X3); MulModP(Y3_,Y3_,R);
+	u64 S1HHH[4]; Copy_u64_x4(S1HHH,(void*)Y1); MulModP(S1HHH,S1HHH,HHH);
+	SubModP((u64*)Y3,Y3_,S1HHH);
+	Copy_u64_x4(Z3tmp,(void*)Z1); MulModP(Z3tmp,Z3tmp,H);
+	Copy_u64_x4((u64*)Z3,Z3tmp);
+}
+// ============================================================================
+
diff --git a/RCKangaroo.cpp b/RCKangaroo.cpp
index 19c0fae..605c470 100644
--- a/RCKangaroo.cpp
+++ b/RCKangaroo.cpp
@@ -1,764 +1,789 @@
-// This file is a part of RCKangaroo software
-// (c) 2024, RetiredCoder (RC)
-// License: GPLv3, see "LICENSE.TXT" file
-// https://github.com/RetiredC
-
-
-#include <iostream>
-#include <vector>
-
-#include "cuda_runtime.h"
-#include "cuda.h"
-
-#include "defs.h"
-#include "utils.h"
-#include "GpuKang.h"
-
-
-EcJMP EcJumps1[JMP_CNT];
-EcJMP EcJumps2[JMP_CNT];
-EcJMP EcJumps3[JMP_CNT];
-
-RCGpuKang* GpuKangs[MAX_GPU_CNT];
-int GpuCnt;
-volatile long ThrCnt;
-volatile bool gSolved;
-
-EcInt Int_HalfRange;
-EcPoint Pnt_HalfRange;
-EcPoint Pnt_NegHalfRange;
-EcInt Int_TameOffset;
-Ec ec;
-
-CriticalSection csAddPoints;
-u8* pPntList;
-u8* pPntList2;
-volatile int PntIndex;
-TFastBase db;
-EcPoint gPntToSolve;
-EcInt gPrivKey;
-
-volatile u64 TotalOps;
-u32 TotalSolved;
-u32 gTotalErrors;
-u64 PntTotalOps;
-bool IsBench;
-
-u32 gDP;
-u32 gRange;
-EcInt gStart;
-bool gStartSet;
-EcPoint gPubKey;
-u8 gGPUs_Mask[MAX_GPU_CNT];
-char gTamesFileName[1024];
-double gMax;
-bool gGenMode; //tames generation mode
-bool gIsOpsLimit;
-
-#pragma pack(push, 1)
-struct DBRec
-{
-	u8 x[12];
-	u8 d[22];
-	u8 type; //0 - tame, 1 - wild1, 2 - wild2
-};
-#pragma pack(pop)
-
-void InitGpus()
-{
-	GpuCnt = 0;
-	int gcnt = 0;
-	cudaGetDeviceCount(&gcnt);
-	if (gcnt > MAX_GPU_CNT)
-		gcnt = MAX_GPU_CNT;
-
-//	gcnt = 1; //dbg
-	if (!gcnt)
-		return;
-
-	int drv, rt;
-	cudaRuntimeGetVersion(&rt);
-	cudaDriverGetVersion(&drv);
-	char drvver[100];
-	sprintf(drvver, "%d.%d/%d.%d", drv / 1000, (drv % 100) / 10, rt / 1000, (rt % 100) / 10);
-
-	printf("CUDA devices: %d, CUDA driver/runtime: %s\r\n", gcnt, drvver);
-	cudaError_t cudaStatus;
-	for (int i = 0; i < gcnt; i++)
-	{
-		cudaStatus = cudaSetDevice(i);
-		if (cudaStatus != cudaSuccess)
-		{
-			printf("cudaSetDevice for gpu %d failed!\r\n", i);
-			continue;
-		}
-
-		if (!gGPUs_Mask[i])
-			continue;
-
-		cudaDeviceProp deviceProp;
-		cudaGetDeviceProperties(&deviceProp, i);
-		printf("GPU %d: %s, %.2f GB, %d CUs, cap %d.%d, PCI %d, L2 size: %d KB\r\n", i, deviceProp.name, ((float)(deviceProp.totalGlobalMem / (1024 * 1024))) / 1024.0f, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor, deviceProp.pciBusID, deviceProp.l2CacheSize / 1024);
-		
-		if (deviceProp.major < 6)
-		{
-			printf("GPU %d - not supported, skip\r\n", i);
-			continue;
-		}
-
-		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-
-		GpuKangs[GpuCnt] = new RCGpuKang();
-		GpuKangs[GpuCnt]->CudaIndex = i;
-		GpuKangs[GpuCnt]->persistingL2CacheMaxSize = deviceProp.persistingL2CacheMaxSize;
-		GpuKangs[GpuCnt]->mpCnt = deviceProp.multiProcessorCount;
-		GpuKangs[GpuCnt]->IsOldGpu = deviceProp.l2CacheSize < 16 * 1024 * 1024;
-		GpuCnt++;
-	}
-	printf("Total GPUs for work: %d\r\n", GpuCnt);
-}
-#ifdef _WIN32
-u32 __stdcall kang_thr_proc(void* data)
-{
-	RCGpuKang* Kang = (RCGpuKang*)data;
-	Kang->Execute();
-	InterlockedDecrement(&ThrCnt);
-	return 0;
-}
-#else
-void* kang_thr_proc(void* data)
-{
-	RCGpuKang* Kang = (RCGpuKang*)data;
-	Kang->Execute();
-	__sync_fetch_and_sub(&ThrCnt, 1);
-	return 0;
-}
-#endif
-void AddPointsToList(u32* data, int pnt_cnt, u64 ops_cnt)
-{
-	csAddPoints.Enter();
-	if (PntIndex + pnt_cnt >= MAX_CNT_LIST)
-	{
-		csAddPoints.Leave();
-		printf("DPs buffer overflow, some points lost, increase DP value!\r\n");
-		return;
-	}
-	memcpy(pPntList + GPU_DP_SIZE * PntIndex, data, pnt_cnt * GPU_DP_SIZE);
-	PntIndex += pnt_cnt;
-	PntTotalOps += ops_cnt;
-	csAddPoints.Leave();
-}
-
-bool Collision_SOTA(EcPoint& pnt, EcInt t, int TameType, EcInt w, int WildType, bool IsNeg)
-{
-	if (IsNeg)
-		t.Neg();
-	if (TameType == TAME)
-	{
-		gPrivKey = t;
-		gPrivKey.Sub(w);
-		EcInt sv = gPrivKey;
-		gPrivKey.Add(Int_HalfRange);
-		EcPoint P = ec.MultiplyG(gPrivKey);
-		if (P.IsEqual(pnt))
-			return true;
-		gPrivKey = sv;
-		gPrivKey.Neg();
-		gPrivKey.Add(Int_HalfRange);
-		P = ec.MultiplyG(gPrivKey);
-		return P.IsEqual(pnt);
-	}
-	else
-	{
-		gPrivKey = t;
-		gPrivKey.Sub(w);
-		if (gPrivKey.data[4] >> 63)
-			gPrivKey.Neg();
-		gPrivKey.ShiftRight(1);
-		EcInt sv = gPrivKey;
-		gPrivKey.Add(Int_HalfRange);
-		EcPoint P = ec.MultiplyG(gPrivKey);
-		if (P.IsEqual(pnt))
-			return true;
-		gPrivKey = sv;
-		gPrivKey.Neg();
-		gPrivKey.Add(Int_HalfRange);
-		P = ec.MultiplyG(gPrivKey);
-		return P.IsEqual(pnt);
-	}
-}
-
-
-void CheckNewPoints()
-{
-	csAddPoints.Enter();
-	if (!PntIndex)
-	{
-		csAddPoints.Leave();
-		return;
-	}
-
-	int cnt = PntIndex;
-	memcpy(pPntList2, pPntList, GPU_DP_SIZE * cnt);
-	PntIndex = 0;
-	csAddPoints.Leave();
-
-	for (int i = 0; i < cnt; i++)
-	{
-		DBRec nrec;
-		u8* p = pPntList2 + i * GPU_DP_SIZE;
-		memcpy(nrec.x, p, 12);
-		memcpy(nrec.d, p + 16, 22);
-		nrec.type = gGenMode ? TAME : p[40];
-
-		DBRec* pref = (DBRec*)db.FindOrAddDataBlock((u8*)&nrec);
-		if (gGenMode)
-			continue;
-		if (pref)
-		{
-			//in db we dont store first 3 bytes so restore them
-			DBRec tmp_pref;
-			memcpy(&tmp_pref, &nrec, 3);
-			memcpy(((u8*)&tmp_pref) + 3, pref, sizeof(DBRec) - 3);
-			pref = &tmp_pref;
-
-			if (pref->type == nrec.type)
-			{
-				if (pref->type == TAME)
-					continue;
-
-				//if it's wild, we can find the key from the same type if distances are different
-				if (*(u64*)pref->d == *(u64*)nrec.d)
-					continue;
-				//else
-				//	ToLog("key found by same wild");
-			}
-
-			EcInt w, t;
-			int TameType, WildType;
-			if (pref->type != TAME)
-			{
-				memcpy(w.data, pref->d, sizeof(pref->d));
-				if (pref->d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18);
-				memcpy(t.data, nrec.d, sizeof(nrec.d));
-				if (nrec.d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18);
-				TameType = nrec.type;
-				WildType = pref->type;
-			}
-			else
-			{
-				memcpy(w.data, nrec.d, sizeof(nrec.d));
-				if (nrec.d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18);
-				memcpy(t.data, pref->d, sizeof(pref->d));
-				if (pref->d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18);
-				TameType = TAME;
-				WildType = nrec.type;
-			}
-
-			bool res = Collision_SOTA(gPntToSolve, t, TameType, w, WildType, false) || Collision_SOTA(gPntToSolve, t, TameType, w, WildType, true);
-			if (!res)
-			{
-				bool w12 = ((pref->type == WILD1) && (nrec.type == WILD2)) || ((pref->type == WILD2) && (nrec.type == WILD1));
-				if (w12) //in rare cases WILD and WILD2 can collide in mirror, in this case there is no way to find K
-					;// ToLog("W1 and W2 collides in mirror");
-				else
-				{
-					printf("Collision Error\r\n");
-					gTotalErrors++;
-				}
-				continue;
-			}
-			gSolved = true;
-			break;
-		}
-	}
-}
-
-void ShowStats(u64 tm_start, double exp_ops, double dp_val)
-{
-#ifdef DEBUG_MODE
-	for (int i = 0; i <= MD_LEN; i++)
-	{
-		u64 val = 0;
-		for (int j = 0; j < GpuCnt; j++)
-		{
-			val += GpuKangs[j]->dbg[i];
-		}
-		if (val)
-			printf("Loop size %d: %llu\r\n", i, val);
-	}
-#endif
-
-	int speed = GpuKangs[0]->GetStatsSpeed();
-	for (int i = 1; i < GpuCnt; i++)
-		speed += GpuKangs[i]->GetStatsSpeed();
-
-	u64 est_dps_cnt = (u64)(exp_ops / dp_val);
-	u64 exp_sec = 0xFFFFFFFFFFFFFFFFull;
-	if (speed)
-		exp_sec = (u64)((exp_ops / 1000000) / speed); //in sec
-	u64 exp_days = exp_sec / (3600 * 24);
-	int exp_hours = (int)(exp_sec - exp_days * (3600 * 24)) / 3600;
-	int exp_min = (int)(exp_sec - exp_days * (3600 * 24) - exp_hours * 3600) / 60;
-
-	u64 sec = (GetTickCount64() - tm_start) / 1000;
-	u64 days = sec / (3600 * 24);
-	int hours = (int)(sec - days * (3600 * 24)) / 3600;
-	int min = (int)(sec - days * (3600 * 24) - hours * 3600) / 60;
-	 
-	printf("%sSpeed: %d MKeys/s, Err: %d, DPs: %lluK/%lluK, Time: %llud:%02dh:%02dm/%llud:%02dh:%02dm\r\n", gGenMode ? "GEN: " : (IsBench ? "BENCH: " : "MAIN: "), speed, gTotalErrors, db.GetBlockCnt()/1000, est_dps_cnt/1000, days, hours, min, exp_days, exp_hours, exp_min);
-}
-
-bool SolvePoint(EcPoint PntToSolve, int Range, int DP, EcInt* pk_res)
-{
-	if ((Range < 32) || (Range > 180))
-	{
-		printf("Unsupported Range value (%d)!\r\n", Range);
-		return false;
-	}
-	if ((DP < 14) || (DP > 60)) 
-	{
-		printf("Unsupported DP value (%d)!\r\n", DP);
-		return false;
-	}
-
-	printf("\r\nSolving point: Range %d bits, DP %d, start...\r\n", Range, DP);
-	double ops = 1.15 * pow(2.0, Range / 2.0);
-	double dp_val = (double)(1ull << DP);
-	double ram = (32 + 4 + 4) * ops / dp_val; //+4 for grow allocation and memory fragmentation
-	ram += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table
-	ram /= (1024 * 1024 * 1024); //GB
-	printf("SOTA method, estimated ops: 2^%.3f, RAM for DPs: %.3f GB. DP and GPU overheads not included!\r\n", log2(ops), ram);
-	gIsOpsLimit = false;
-	double MaxTotalOps = 0.0;
-	if (gMax > 0)
-	{
-		MaxTotalOps = gMax * ops;
-		double ram_max = (32 + 4 + 4) * MaxTotalOps / dp_val; //+4 for grow allocation and memory fragmentation
-		ram_max += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table
-		ram_max /= (1024 * 1024 * 1024); //GB
-		printf("Max allowed number of ops: 2^%.3f, max RAM for DPs: %.3f GB\r\n", log2(MaxTotalOps), ram_max);
-	}
-
-	u64 total_kangs = GpuKangs[0]->CalcKangCnt();
-	for (int i = 1; i < GpuCnt; i++)
-		total_kangs += GpuKangs[i]->CalcKangCnt();
-	double path_single_kang = ops / total_kangs;	
-	double DPs_per_kang = path_single_kang / dp_val;
-	printf("Estimated DPs per kangaroo: %.3f.%s\r\n", DPs_per_kang, (DPs_per_kang < 5) ? " DP overhead is big, use less DP value if possible!" : "");
-
-	if (!gGenMode && gTamesFileName[0])
-	{
-		printf("load tames...\r\n");
-		if (db.LoadFromFile(gTamesFileName))
-		{
-			printf("tames loaded\r\n");
-			if (db.Header[0] != gRange)
-			{
-				printf("loaded tames have different range, they cannot be used, clear\r\n");
-				db.Clear();
-			}
-		}
-		else
-			printf("tames loading failed\r\n");
-	}
-
-	SetRndSeed(0); //use same seed to make tames from file compatible
-	PntTotalOps = 0;
-	PntIndex = 0;
-//prepare jumps
-	EcInt minjump, t;
-	minjump.Set(1);
-	minjump.ShiftLeft(Range / 2 + 3);
-	for (int i = 0; i < JMP_CNT; i++)
-	{
-		EcJumps1[i].dist = minjump;
-		t.RndMax(minjump);
-		EcJumps1[i].dist.Add(t);
-		EcJumps1[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
-		EcJumps1[i].p = ec.MultiplyG(EcJumps1[i].dist);
-	}
-
-	minjump.Set(1);
-	minjump.ShiftLeft(Range - 10); //large jumps for L1S2 loops. Must be almost RANGE_BITS
-	for (int i = 0; i < JMP_CNT; i++)
-	{
-		EcJumps2[i].dist = minjump;
-		t.RndMax(minjump);
-		EcJumps2[i].dist.Add(t);
-		EcJumps2[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
-		EcJumps2[i].p = ec.MultiplyG(EcJumps2[i].dist);
-	}
-
-	minjump.Set(1);
-	minjump.ShiftLeft(Range - 10 - 2); //large jumps for loops >2
-	for (int i = 0; i < JMP_CNT; i++)
-	{
-		EcJumps3[i].dist = minjump;
-		t.RndMax(minjump);
-		EcJumps3[i].dist.Add(t);
-		EcJumps3[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
-		EcJumps3[i].p = ec.MultiplyG(EcJumps3[i].dist);
-	}
-	SetRndSeed(GetTickCount64());
-
-	Int_HalfRange.Set(1);
-	Int_HalfRange.ShiftLeft(Range - 1);
-	Pnt_HalfRange = ec.MultiplyG(Int_HalfRange);
-	Pnt_NegHalfRange = Pnt_HalfRange;
-	Pnt_NegHalfRange.y.NegModP();
-	Int_TameOffset.Set(1);
-	Int_TameOffset.ShiftLeft(Range - 1);
-	EcInt tt;
-	tt.Set(1);
-	tt.ShiftLeft(Range - 5); //half of tame range width
-	Int_TameOffset.Sub(tt);
-	gPntToSolve = PntToSolve;
-
-//prepare GPUs
-	for (int i = 0; i < GpuCnt; i++)
-		if (!GpuKangs[i]->Prepare(PntToSolve, Range, DP, EcJumps1, EcJumps2, EcJumps3))
-		{
-			GpuKangs[i]->Failed = true;
-			printf("GPU %d Prepare failed\r\n", GpuKangs[i]->CudaIndex);
-		}
-
-	u64 tm0 = GetTickCount64();
-	printf("GPUs started...\r\n");
-
-#ifdef _WIN32
-	HANDLE thr_handles[MAX_GPU_CNT];
-#else
-	pthread_t thr_handles[MAX_GPU_CNT];
-#endif
-
-	u32 ThreadID;
-	gSolved = false;
-	ThrCnt = GpuCnt;
-	for (int i = 0; i < GpuCnt; i++)
-	{
-#ifdef _WIN32
-		thr_handles[i] = (HANDLE)_beginthreadex(NULL, 0, kang_thr_proc, (void*)GpuKangs[i], 0, &ThreadID);
-#else
-		pthread_create(&thr_handles[i], NULL, kang_thr_proc, (void*)GpuKangs[i]);
-#endif
-	}
-
-	u64 tm_stats = GetTickCount64();
-	while (!gSolved)
-	{
-		CheckNewPoints();
-		Sleep(10);
-		if (GetTickCount64() - tm_stats > 10 * 1000)
-		{
-			ShowStats(tm0, ops, dp_val);
-			tm_stats = GetTickCount64();
-		}
-
-		if ((MaxTotalOps > 0.0) && (PntTotalOps > MaxTotalOps))
-		{
-			gIsOpsLimit = true;
-			printf("Operations limit reached\r\n");
-			break;
-		}
-	}
-
-	printf("Stopping work ...\r\n");
-	for (int i = 0; i < GpuCnt; i++)
-		GpuKangs[i]->Stop();
-	while (ThrCnt)
-		Sleep(10);
-	for (int i = 0; i < GpuCnt; i++)
-	{
-#ifdef _WIN32
-		CloseHandle(thr_handles[i]);
-#else
-		pthread_join(thr_handles[i], NULL);
-#endif
-	}
-
-	if (gIsOpsLimit)
-	{
-		if (gGenMode)
-		{
-			printf("saving tames...\r\n");
-			db.Header[0] = gRange; 
-			if (db.SaveToFile(gTamesFileName))
-				printf("tames saved\r\n");
-			else
-				printf("tames saving failed\r\n");
-		}
-		db.Clear();
-		return false;
-	}
-
-	double K = (double)PntTotalOps / pow(2.0, Range / 2.0);
-	printf("Point solved, K: %.3f (with DP and GPU overheads)\r\n\r\n", K);
-	db.Clear();
-	*pk_res = gPrivKey;
-	return true;
-}
-
-bool ParseCommandLine(int argc, char* argv[])
-{
-	int ci = 1;
-	while (ci < argc)
-	{
-		char* argument = argv[ci];
-		ci++;
-		if (strcmp(argument, "-gpu") == 0)
-		{
-			if (ci >= argc)
-			{
-				printf("error: missed value after -gpu option\r\n");
-				return false;
-			}
-			char* gpus = argv[ci];
-			ci++;
-			memset(gGPUs_Mask, 0, sizeof(gGPUs_Mask));
-			for (int i = 0; i < (int)strlen(gpus); i++)
-			{
-				if ((gpus[i] < '0') || (gpus[i] > '9'))
-				{
-					printf("error: invalid value for -gpu option\r\n");
-					return false;
-				}
-				gGPUs_Mask[gpus[i] - '0'] = 1;
-			}
-		}
-		else
-		if (strcmp(argument, "-dp") == 0)
-		{
-			int val = atoi(argv[ci]);
-			ci++;
-			if ((val < 14) || (val > 60))
-			{
-				printf("error: invalid value for -dp option\r\n");
-				return false;
-			}
-			gDP = val;
-		}
-		else
-		if (strcmp(argument, "-range") == 0)
-		{
-			int val = atoi(argv[ci]);
-			ci++;
-			if ((val < 32) || (val > 170))
-			{
-				printf("error: invalid value for -range option\r\n");
-				return false;
-			}
-			gRange = val;
-		}
-		else
-		if (strcmp(argument, "-start") == 0)
-		{	
-			if (!gStart.SetHexStr(argv[ci]))
-			{
-				printf("error: invalid value for -start option\r\n");
-				return false;
-			}
-			ci++;
-			gStartSet = true;
-		}
-		else
-		if (strcmp(argument, "-pubkey") == 0)
-		{
-			if (!gPubKey.SetHexStr(argv[ci]))
-			{
-				printf("error: invalid value for -pubkey option\r\n");
-				return false;
-			}
-			ci++;
-		}
-		else
-		if (strcmp(argument, "-tames") == 0)
-		{
-			strcpy(gTamesFileName, argv[ci]);
-			ci++;
-		}
-		else
-		if (strcmp(argument, "-max") == 0)
-		{
-			double val = atof(argv[ci]);
-			ci++;
-			if (val < 0.001)
-			{
-				printf("error: invalid value for -max option\r\n");
-				return false;
-			}
-			gMax = val;
-		}
-		else
-		{
-			printf("error: unknown option %s\r\n", argument);
-			return false;
-		}
-	}
-	if (!gPubKey.x.IsZero())
-		if (!gStartSet || !gRange || !gDP)
-		{
-			printf("error: you must also specify -dp, -range and -start options\r\n");
-			return false;
-		}
-	if (gTamesFileName[0] && !IsFileExist(gTamesFileName))
-	{
-		if (gMax == 0.0)
-		{
-			printf("error: you must also specify -max option to generate tames\r\n");
-			return false;
-		}
-		gGenMode = true;
-	}
-	return true;
-}
-
-int main(int argc, char* argv[])
-{
-#ifdef _DEBUG	
-	_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
-#endif
-
-	printf("********************************************************************************\r\n");
-	printf("*                    RCKangaroo v3.0  (c) 2024 RetiredCoder                    *\r\n");
-	printf("********************************************************************************\r\n\r\n");
-
-	printf("This software is free and open-source: https://github.com/RetiredC\r\n");
-	printf("It demonstrates fast GPU implementation of SOTA Kangaroo method for solving ECDLP\r\n");
-
-#ifdef _WIN32
-	printf("Windows version\r\n");
-#else
-	printf("Linux version\r\n");
-#endif
-
-#ifdef DEBUG_MODE
-	printf("DEBUG MODE\r\n\r\n");
-#endif
-
-	InitEc();
-	gDP = 0;
-	gRange = 0;
-	gStartSet = false;
-	gTamesFileName[0] = 0;
-	gMax = 0.0;
-	gGenMode = false;
-	gIsOpsLimit = false;
-	memset(gGPUs_Mask, 1, sizeof(gGPUs_Mask));
-	if (!ParseCommandLine(argc, argv))
-		return 0;
-
-	InitGpus();
-
-	if (!GpuCnt)
-	{
-		printf("No supported GPUs detected, exit\r\n");
-		return 0;
-	}
-
-	pPntList = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE);
-	pPntList2 = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE);
-	TotalOps = 0;
-	TotalSolved = 0;
-	gTotalErrors = 0;
-	IsBench = gPubKey.x.IsZero();
-
-	if (!IsBench && !gGenMode)
-	{
-		printf("\r\nMAIN MODE\r\n\r\n");
-		EcPoint PntToSolve, PntOfs;
-		EcInt pk, pk_found;
-
-		PntToSolve = gPubKey;
-		if (!gStart.IsZero())
-		{
-			PntOfs = ec.MultiplyG(gStart);
-			PntOfs.y.NegModP();
-			PntToSolve = ec.AddPoints(PntToSolve, PntOfs);
-		}
-
-		char sx[100], sy[100];
-		gPubKey.x.GetHexStr(sx);
-		gPubKey.y.GetHexStr(sy);
-		printf("Solving public key\r\nX: %s\r\nY: %s\r\n", sx, sy);
-		gStart.GetHexStr(sx);
-		printf("Offset: %s\r\n", sx);
-
-		if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found))
-		{
-			if (!gIsOpsLimit)
-				printf("FATAL ERROR: SolvePoint failed\r\n");
-			goto label_end;
-		}
-		pk_found.AddModP(gStart);
-		EcPoint tmp = ec.MultiplyG(pk_found);
-		if (!tmp.IsEqual(gPubKey))
-		{
-			printf("FATAL ERROR: SolvePoint found incorrect key\r\n");
-			goto label_end;
-		}
-		//happy end
-		char s[100];
-		pk_found.GetHexStr(s);
-		printf("\r\nPRIVATE KEY: %s\r\n\r\n", s);
-		FILE* fp = fopen("RESULTS.TXT", "a");
-		if (fp)
-		{
-			fprintf(fp, "PRIVATE KEY: %s\n", s);
-			fclose(fp);
-		}
-		else //we cannot save the key, show error and wait forever so the key is displayed
-		{
-			printf("WARNING: Cannot save the key to RESULTS.TXT!\r\n");
-			while (1)
-				Sleep(100);
-		}
-	}
-	else
-	{
-		if (gGenMode)
-			printf("\r\nTAMES GENERATION MODE\r\n");
-		else
-			printf("\r\nBENCHMARK MODE\r\n");
-		//solve points, show K
-		while (1)
-		{
-			EcInt pk, pk_found;
-			EcPoint PntToSolve;
-
-			if (!gRange)
-				gRange = 78;
-			if (!gDP)
-				gDP = 16;
-
-			//generate random pk
-			pk.RndBits(gRange);
-			PntToSolve = ec.MultiplyG(pk);
-
-			if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found))
-			{
-				if (!gIsOpsLimit)
-					printf("FATAL ERROR: SolvePoint failed\r\n");
-				break;
-			}
-			if (!pk_found.IsEqual(pk))
-			{
-				printf("FATAL ERROR: Found key is wrong!\r\n");
-				break;
-			}
-			TotalOps += PntTotalOps;
-			TotalSolved++;
-			u64 ops_per_pnt = TotalOps / TotalSolved;
-			double K = (double)ops_per_pnt / pow(2.0, gRange / 2.0);
-			printf("Points solved: %d, average K: %.3f (with DP and GPU overheads)\r\n", TotalSolved, K);
-			//if (TotalSolved >= 100) break; //dbg
-		}
-	}
-label_end:
-	for (int i = 0; i < GpuCnt; i++)
-		delete GpuKangs[i];
-	DeInitEc();
-	free(pPntList2);
-	free(pPntList);
-}
-
+// This file is a part of RCKangaroo software
+// (c) 2024, RetiredCoder (RC)
+// License: GPLv3, see "LICENSE.TXT" file
+// https://github.com/RetiredC
+
+
+#include <iostream>
+#include <cstdlib>
+
+#include <vector>
+
+#include "cuda_runtime.h"
+#include "cuda.h"
+
+#include "defs.h"
+#include "utils.h"
+#ifndef DB_REC_LEN
+#define DB_REC_LEN 32
+#endif
+#include "GpuKang.h"
+
+
+EcJMP EcJumps1[JMP_CNT];
+EcJMP EcJumps2[JMP_CNT];
+EcJMP EcJumps3[JMP_CNT];
+
+RCGpuKang* GpuKangs[MAX_GPU_CNT];
+int GpuCnt;
+volatile long ThrCnt;
+volatile bool gSolved;
+
+EcInt Int_HalfRange;
+EcPoint Pnt_HalfRange;
+EcPoint Pnt_NegHalfRange;
+EcInt Int_TameOffset;
+Ec ec;
+
+CriticalSection csAddPoints;
+u8* pPntList;
+u8* pPntList2;
+volatile int PntIndex;
+TFastBase db;
+EcPoint gPntToSolve;
+EcInt gPrivKey;
+
+volatile u64 TotalOps;
+u32 TotalSolved;
+u32 gTotalErrors;
+u64 PntTotalOps;
+bool IsBench;
+
+u32 gDP;
+u32 gRange;
+EcInt gStart;
+bool gStartSet;
+EcPoint gPubKey;
+u8 gGPUs_Mask[MAX_GPU_CNT];
+char gTamesFileName[1024];
+int gTameRatioPct = 33;
+int gTameBitsOffset = 4;
+double gMax;
+bool gGenMode; //tames generation mode
+bool gIsOpsLimit;
+
+#pragma pack(push, 1)
+struct DBRec
+{
+	u8 x[12];
+	u8 d[22];
+	u8 type; //0 - tame, 1 - wild1, 2 - wild2
+};
+#pragma pack(pop)
+
+void InitGpus()
+{
+	GpuCnt = 0;
+	int gcnt = 0;
+	cudaGetDeviceCount(&gcnt);
+	if (gcnt > MAX_GPU_CNT)
+		gcnt = MAX_GPU_CNT;
+
+//	gcnt = 1; //dbg
+	if (!gcnt)
+		return;
+
+	int drv, rt;
+	cudaRuntimeGetVersion(&rt);
+	cudaDriverGetVersion(&drv);
+	char drvver[100];
+	sprintf(drvver, "%d.%d/%d.%d", drv / 1000, (drv % 100) / 10, rt / 1000, (rt % 100) / 10);
+
+	printf("CUDA devices: %d, CUDA driver/runtime: %s\r\n", gcnt, drvver);
+	cudaError_t cudaStatus;
+	for (int i = 0; i < gcnt; i++)
+	{
+		cudaStatus = cudaSetDevice(i);
+		if (cudaStatus != cudaSuccess)
+		{
+			printf("cudaSetDevice for gpu %d failed!\r\n", i);
+			continue;
+		}
+
+		if (!gGPUs_Mask[i])
+			continue;
+
+		cudaDeviceProp deviceProp;
+		cudaGetDeviceProperties(&deviceProp, i);
+		printf("GPU %d: %s, %.2f GB, %d CUs, cap %d.%d, PCI %d, L2 size: %d KB\r\n", i, deviceProp.name, ((float)(deviceProp.totalGlobalMem / (1024 * 1024))) / 1024.0f, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor, deviceProp.pciBusID, deviceProp.l2CacheSize / 1024);
+		
+		if (deviceProp.major < 6)
+		{
+			printf("GPU %d - not supported, skip\r\n", i);
+			continue;
+		}
+
+		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+		GpuKangs[GpuCnt] = new RCGpuKang();
+		GpuKangs[GpuCnt]->CudaIndex = i;
+		GpuKangs[GpuCnt]->persistingL2CacheMaxSize = deviceProp.persistingL2CacheMaxSize;
+		GpuKangs[GpuCnt]->mpCnt = deviceProp.multiProcessorCount;
+		GpuKangs[GpuCnt]->IsOldGpu = true;
+		GpuCnt++;
+	}
+	printf("Total GPUs for work: %d\r\n", GpuCnt);
+}
+#ifdef _WIN32
+u32 __stdcall kang_thr_proc(void* data)
+{
+	RCGpuKang* Kang = (RCGpuKang*)data;
+	Kang->Execute();
+	InterlockedDecrement(&ThrCnt);
+	return 0;
+}
+#else
+void* kang_thr_proc(void* data)
+{
+	RCGpuKang* Kang = (RCGpuKang*)data;
+	Kang->Execute();
+	__sync_fetch_and_sub(&ThrCnt, 1);
+	return 0;
+}
+#endif
+void AddPointsToList(u32* data, int pnt_cnt, u64 ops_cnt)
+{
+	csAddPoints.Enter();
+	if (PntIndex + pnt_cnt >= MAX_CNT_LIST)
+	{
+		csAddPoints.Leave();
+		printf("DPs buffer overflow, some points lost, increase DP value!\r\n");
+		return;
+	}
+	memcpy(pPntList + GPU_DP_SIZE * PntIndex, data, pnt_cnt * GPU_DP_SIZE);
+	PntIndex += pnt_cnt;
+	PntTotalOps += ops_cnt;
+	csAddPoints.Leave();
+}
+
+bool Collision_SOTA(EcPoint& pnt, EcInt t, int TameType, EcInt w, int WildType, bool IsNeg)
+{
+	if (IsNeg)
+		t.Neg();
+	if (TameType == TAME)
+	{
+		gPrivKey = t;
+		gPrivKey.Sub(w);
+		EcInt sv = gPrivKey;
+		gPrivKey.Add(Int_HalfRange);
+		EcPoint P = ec.MultiplyG(gPrivKey);
+		if (P.IsEqual(pnt))
+			return true;
+		gPrivKey = sv;
+		gPrivKey.Neg();
+		gPrivKey.Add(Int_HalfRange);
+		P = ec.MultiplyG(gPrivKey);
+		return P.IsEqual(pnt);
+	}
+	else
+	{
+		gPrivKey = t;
+		gPrivKey.Sub(w);
+		if (gPrivKey.data[4] >> 63)
+			gPrivKey.Neg();
+		gPrivKey.ShiftRight(1);
+		EcInt sv = gPrivKey;
+		gPrivKey.Add(Int_HalfRange);
+		EcPoint P = ec.MultiplyG(gPrivKey);
+		if (P.IsEqual(pnt))
+			return true;
+		gPrivKey = sv;
+		gPrivKey.Neg();
+		gPrivKey.Add(Int_HalfRange);
+		P = ec.MultiplyG(gPrivKey);
+		return P.IsEqual(pnt);
+	}
+}
+
+
+void CheckNewPoints()
+{
+	csAddPoints.Enter();
+	if (!PntIndex)
+	{
+		csAddPoints.Leave();
+		return;
+	}
+
+	int cnt = PntIndex;
+	memcpy(pPntList2, pPntList, GPU_DP_SIZE * cnt);
+	PntIndex = 0;
+	csAddPoints.Leave();
+
+	for (int i = 0; i < cnt; i++)
+	{
+		DBRec nrec;
+		u8* p = pPntList2 + i * GPU_DP_SIZE;
+		memcpy(nrec.x, p, 12);
+		memcpy(nrec.d, p + 16, 22);
+		nrec.type = gGenMode ? TAME : p[40];
+
+		DBRec* pref = (DBRec*)db.FindOrAddDataBlock((u8*)&nrec);
+		if (gGenMode)
+			continue;
+		if (pref)
+		{
+			//in db we dont store first 3 bytes so restore them
+			DBRec tmp_pref;
+			memcpy(&tmp_pref, &nrec, 3);
+			size_t __rec_tail = (size_t)DB_REC_LEN;
+					 size_t __exp_tail = (size_t)sizeof(DBRec) - 3;
+					 size_t __cpy = __rec_tail < __exp_tail ? __rec_tail : __exp_tail;
+					 memcpy(((u8*)&tmp_pref) + 3, pref, __cpy);
+					 if (__cpy < __exp_tail) memset(((u8*)&tmp_pref) + 3 + __cpy, 0, __exp_tail - __cpy);
+			pref = &tmp_pref;
+
+			if (pref->type == nrec.type)
+			{
+				if (pref->type == TAME)
+					continue;
+
+				//if it's wild, we can find the key from the same type if distances are different
+				if (*(u64*)pref->d == *(u64*)nrec.d)
+					continue;
+				//else
+				//	ToLog("key found by same wild");
+			}
+
+			EcInt w, t;
+			int TameType, WildType;
+			if (pref->type != TAME)
+			{
+				memcpy(w.data, pref->d, sizeof(pref->d));
+				if (pref->d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18);
+				memcpy(t.data, nrec.d, sizeof(nrec.d));
+				if (nrec.d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18);
+				TameType = nrec.type;
+				WildType = pref->type;
+			}
+			else
+			{
+				memcpy(w.data, nrec.d, sizeof(nrec.d));
+				if (nrec.d[21] == 0xFF) memset(((u8*)w.data) + 22, 0xFF, 18);
+				memcpy(t.data, pref->d, sizeof(pref->d));
+				if (pref->d[21] == 0xFF) memset(((u8*)t.data) + 22, 0xFF, 18);
+				TameType = TAME;
+				WildType = nrec.type;
+			}
+
+			bool res = Collision_SOTA(gPntToSolve, t, TameType, w, WildType, false) || Collision_SOTA(gPntToSolve, t, TameType, w, WildType, true);
+			if (!res)
+			{
+				bool w12 = ((pref->type == WILD1) && (nrec.type == WILD2)) || ((pref->type == WILD2) && (nrec.type == WILD1));
+				if (w12) //in rare cases WILD and WILD2 can collide in mirror, in this case there is no way to find K
+					;// ToLog("W1 and W2 collides in mirror");
+				else
+				{
+					printf("Collision Error\r\n");
+					gTotalErrors++;
+				}
+				continue;
+			}
+			gSolved = true;
+			break;
+		}
+	}
+}
+
+void ShowStats(u64 tm_start, double exp_ops, double dp_val)
+{
+#ifdef DEBUG_MODE
+	for (int i = 0; i <= MD_LEN; i++)
+	{
+		u64 val = 0;
+		for (int j = 0; j < GpuCnt; j++)
+		{
+			val += GpuKangs[j]->dbg[i];
+		}
+		if (val)
+			printf("Loop size %d: %llu\r\n", i, val);
+	}
+#endif
+
+	int speed = GpuKangs[0]->GetStatsSpeed();
+	for (int i = 1; i < GpuCnt; i++)
+		speed += GpuKangs[i]->GetStatsSpeed();
+
+	u64 est_dps_cnt = (u64)(exp_ops / dp_val);
+	u64 exp_sec = 0xFFFFFFFFFFFFFFFFull;
+	if (speed)
+		exp_sec = (u64)((exp_ops / 1000000) / speed); //in sec
+	u64 exp_days = exp_sec / (3600 * 24);
+	int exp_hours = (int)(exp_sec - exp_days * (3600 * 24)) / 3600;
+	int exp_min = (int)(exp_sec - exp_days * (3600 * 24) - exp_hours * 3600) / 60;
+
+	u64 sec = (GetTickCount64() - tm_start) / 1000;
+	u64 days = sec / (3600 * 24);
+	int hours = (int)(sec - days * (3600 * 24)) / 3600;
+	int min = (int)(sec - days * (3600 * 24) - hours * 3600) / 60;
+	 
+	printf("%sSpeed: %d MKeys/s, Err: %d, DPs: %lluK/%lluK, Time: %llud:%02dh:%02dm/%llud:%02dh:%02dm\r\n", gGenMode ? "GEN: " : (IsBench ? "BENCH: " : "MAIN: "), speed, gTotalErrors, db.GetBlockCnt()/1000, est_dps_cnt/1000, days, hours, min, exp_days, exp_hours, exp_min);
+}
+
+bool SolvePoint(EcPoint PntToSolve, int Range, int DP, EcInt* pk_res)
+{
+	if ((Range < 32) || (Range > 180))
+	{
+		printf("Unsupported Range value (%d)!\r\n", Range);
+		return false;
+	}
+	if ((DP < 14) || (DP > 60)) 
+	{
+		printf("Unsupported DP value (%d)!\r\n", DP);
+		return false;
+	}
+
+	printf("\r\nSolving point: Range %d bits, DP %d, start...\r\n", Range, DP);
+	double ops = 1.15 * pow(2.0, Range / 2.0);
+	double dp_val = (double)(1ull << DP);
+	double ram = (32 + 4 + 4) * ops / dp_val; //+4 for grow allocation and memory fragmentation
+	ram += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table
+	ram /= (1024 * 1024 * 1024); //GB
+	printf("SOTA method, estimated ops: 2^%.3f, RAM for DPs: %.3f GB. DP and GPU overheads not included!\r\n", log2(ops), ram);
+	gIsOpsLimit = false;
+	double MaxTotalOps = 0.0;
+	if (gMax > 0)
+	{
+		MaxTotalOps = gMax * ops;
+		double ram_max = (32 + 4 + 4) * MaxTotalOps / dp_val; //+4 for grow allocation and memory fragmentation
+		ram_max += sizeof(TListRec) * 256 * 256 * 256; //3byte-prefix table
+		ram_max /= (1024 * 1024 * 1024); //GB
+		printf("Max allowed number of ops: 2^%.3f, max RAM for DPs: %.3f GB\r\n", log2(MaxTotalOps), ram_max);
+	}
+
+	u64 total_kangs = GpuKangs[0]->CalcKangCnt();
+	for (int i = 1; i < GpuCnt; i++)
+		total_kangs += GpuKangs[i]->CalcKangCnt();
+	double path_single_kang = ops / total_kangs;	
+	double DPs_per_kang = path_single_kang / dp_val;
+	printf("Estimated DPs per kangaroo: %.3f.%s\r\n", DPs_per_kang, (DPs_per_kang < 5) ? " DP overhead is big, use less DP value if possible!" : "");
+
+	if (!gGenMode && gTamesFileName[0])
+	{
+		printf("load tames...\r\n");
+		if (db.LoadFromFile(gTamesFileName))
+		{
+			printf("tames loaded\r\n");
+			if (db.Header[0] != gRange)
+			{
+				printf("loaded tames have different range, they cannot be used, clear\r\n");
+				db.Clear();
+			}
+		}
+		else
+			printf("tames loading failed\r\n");
+	}
+
+	SetRndSeed(0); //use same seed to make tames from file compatible
+	PntTotalOps = 0;
+	PntIndex = 0;
+//prepare jumps
+	EcInt minjump, t;
+	minjump.Set(1);
+	minjump.ShiftLeft(Range / 2 + 3);
+	for (int i = 0; i < JMP_CNT; i++)
+	{
+		EcJumps1[i].dist = minjump;
+		t.RndMax(minjump);
+		EcJumps1[i].dist.Add(t);
+		EcJumps1[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
+		EcJumps1[i].p = ec.MultiplyG(EcJumps1[i].dist);
+	}
+
+	minjump.Set(1);
+	minjump.ShiftLeft(Range - 10); //large jumps for L1S2 loops. Must be almost RANGE_BITS
+	for (int i = 0; i < JMP_CNT; i++)
+	{
+		EcJumps2[i].dist = minjump;
+		t.RndMax(minjump);
+		EcJumps2[i].dist.Add(t);
+		EcJumps2[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
+		EcJumps2[i].p = ec.MultiplyG(EcJumps2[i].dist);
+	}
+
+	minjump.Set(1);
+	minjump.ShiftLeft(Range - 10 - 2); //large jumps for loops >2
+	for (int i = 0; i < JMP_CNT; i++)
+	{
+		EcJumps3[i].dist = minjump;
+		t.RndMax(minjump);
+		EcJumps3[i].dist.Add(t);
+		EcJumps3[i].dist.data[0] &= 0xFFFFFFFFFFFFFFFE; //must be even
+		EcJumps3[i].p = ec.MultiplyG(EcJumps3[i].dist);
+	}
+	SetRndSeed(GetTickCount64());
+
+	Int_HalfRange.Set(1);
+	Int_HalfRange.ShiftLeft(Range - 1);
+	Pnt_HalfRange = ec.MultiplyG(Int_HalfRange);
+	Pnt_NegHalfRange = Pnt_HalfRange;
+	Pnt_NegHalfRange.y.NegModP();
+	Int_TameOffset.Set(1);
+	Int_TameOffset.ShiftLeft(Range - 1);
+	EcInt tt;
+	tt.Set(1);
+	tt.ShiftLeft(Range - 5); //half of tame range width
+	Int_TameOffset.Sub(tt);
+	gPntToSolve = PntToSolve;
+
+//prepare GPUs
+	for (int i = 0; i < GpuCnt; i++)
+		if (!GpuKangs[i]->Prepare(PntToSolve, Range, DP, EcJumps1, EcJumps2, EcJumps3))
+		{
+			GpuKangs[i]->Failed = true;
+			printf("GPU %d Prepare failed\r\n", GpuKangs[i]->CudaIndex);
+		}
+
+	u64 tm0 = GetTickCount64();
+	printf("GPUs started...\r\n");
+
+#ifdef _WIN32
+	HANDLE thr_handles[MAX_GPU_CNT];
+#else
+	pthread_t thr_handles[MAX_GPU_CNT];
+#endif
+
+	u32 ThreadID;
+	gSolved = false;
+	ThrCnt = GpuCnt;
+	for (int i = 0; i < GpuCnt; i++)
+	{
+#ifdef _WIN32
+		thr_handles[i] = (HANDLE)_beginthreadex(NULL, 0, kang_thr_proc, (void*)GpuKangs[i], 0, &ThreadID);
+#else
+		pthread_create(&thr_handles[i], NULL, kang_thr_proc, (void*)GpuKangs[i]);
+#endif
+	}
+
+	u64 tm_stats = GetTickCount64();
+	while (!gSolved)
+	{
+		CheckNewPoints();
+		Sleep(10);
+		if (GetTickCount64() - tm_stats > 10 * 1000)
+		{
+			ShowStats(tm0, ops, dp_val);
+			tm_stats = GetTickCount64();
+		}
+
+		if ((MaxTotalOps > 0.0) && (PntTotalOps > MaxTotalOps))
+		{
+			gIsOpsLimit = true;
+			printf("Operations limit reached\r\n");
+			break;
+		}
+	}
+
+	printf("Stopping work ...\r\n");
+	for (int i = 0; i < GpuCnt; i++)
+		GpuKangs[i]->Stop();
+	while (ThrCnt)
+		Sleep(10);
+	for (int i = 0; i < GpuCnt; i++)
+	{
+#ifdef _WIN32
+		CloseHandle(thr_handles[i]);
+#else
+		pthread_join(thr_handles[i], NULL);
+#endif
+	}
+
+	if (gIsOpsLimit)
+	{
+		if (gGenMode)
+		{
+			printf("saving tames...\r\n");
+			db.Header[0] = gRange; 
+			if (db.SaveToFile(gTamesFileName))
+				printf("tames saved\r\n");
+			else
+				printf("tames saving failed\r\n");
+		}
+		db.Clear();
+		return false;
+	}
+
+	double K = (double)PntTotalOps / pow(2.0, Range / 2.0);
+	printf("Point solved, K: %.3f (with DP and GPU overheads)\r\n\r\n", K);
+	db.Clear();
+	*pk_res = gPrivKey;
+	return true;
+}
+
+bool ParseCommandLine(int argc, char* argv[])
+{
+	int ci = 1;
+	while (ci < argc)
+	{
+		char* argument = argv[ci];
+		ci++;
+		if (strcmp(argument, "-gpu") == 0)
+		{
+			if (ci >= argc)
+			{
+				printf("error: missed value after -gpu option\r\n");
+				return false;
+			}
+			char* gpus = argv[ci];
+			ci++;
+			memset(gGPUs_Mask, 0, sizeof(gGPUs_Mask));
+			for (int i = 0; i < (int)strlen(gpus); i++)
+			{
+				if ((gpus[i] < '0') || (gpus[i] > '9'))
+				{
+					printf("error: invalid value for -gpu option\r\n");
+					return false;
+				}
+				gGPUs_Mask[gpus[i] - '0'] = 1;
+			}
+		}
+		else
+		if (strcmp(argument, "-dp") == 0)
+		{
+			int val = atoi(argv[ci]);
+			ci++;
+			if ((val < 14) || (val > 60))
+			{
+				printf("error: invalid value for -dp option\r\n");
+				return false;
+			}
+			gDP = val;
+		}
+		else
+		if (strcmp(argument, "-range") == 0)
+		{
+			int val = atoi(argv[ci]);
+			ci++;
+			if ((val < 32) || (val > 170))
+			{
+				printf("error: invalid value for -range option\r\n");
+				return false;
+			}
+			gRange = val;
+		}
+		else
+		if (strcmp(argument, "-start") == 0)
+		{	
+			if (!gStart.SetHexStr(argv[ci]))
+			{
+				printf("error: invalid value for -start option\r\n");
+				return false;
+			}
+			ci++;
+			gStartSet = true;
+		}
+		else
+		if (strcmp(argument, "-pubkey") == 0)
+		{
+			if (!gPubKey.SetHexStr(argv[ci]))
+			{
+				printf("error: invalid value for -pubkey option\r\n");
+				return false;
+			}
+			ci++;
+		}
+		else
+		if (strcmp(argument, "-tame-ratio") == 0)
+		{
+			if (ci >= argc) { printf("error: missed value after -tame-ratio option\r\n"); return false; }
+			int ratio = atoi(argv[ci]); ci++;
+			if (ratio < 1 || ratio > 90) { printf("error: invalid value for -tame-ratio (1..90)\r\n"); return false; }
+			gTameRatioPct = ratio;
+		}
+		else
+		if (strcmp(argument, "-tame-bits") == 0)
+		{
+			if (ci >= argc) { printf("error: missed value after -tame-bits option\r\n"); return false; }
+			int bits = atoi(argv[ci]); ci++;
+			if (bits < 1 || bits > 32) { printf("error: invalid value for -tame-bits (1..32)\r\n"); return false; }
+			gTameBitsOffset = bits;
+		}
+
+		else
+		if (strcmp(argument, "-tames") == 0)
+		{
+			if (ci >= argc) { printf("error: missed value after -tames option\r\n"); return false; }
+			strncpy(gTamesFileName, argv[ci], sizeof(gTamesFileName)-1);
+			gTamesFileName[sizeof(gTamesFileName)-1] = 0;
+			ci++;
+		}
+		else
+		if (strcmp(argument, "-max") == 0)
+		{
+			if (ci >= argc) { printf("error: missed value after -max option\r\n"); return false; }
+			double val = atof(argv[ci]); ci++;
+			if (!(val > 0.0)) { printf("error: invalid value for -max option\r\n"); return false; }
+			gMax = val;
+		}
+else
+		{
+			printf("error: unknown option %s\r\n", argument);
+			return false;
+		}
+	}
+	if (!gPubKey.x.IsZero())
+		if (!gStartSet || !gRange || !gDP)
+		{
+			printf("error: you must also specify -dp, -range and -start options\r\n");
+			return false;
+		}
+	if (gTamesFileName[0] && !IsFileExist(gTamesFileName))
+	{
+		if (gMax == 0.0)
+		{
+			printf("error: you must also specify -max option to generate tames\r\n");
+			return false;
+		}
+		gGenMode = true;
+	}
+	return true;
+}
+
+int main(int argc, char* argv[])
+{
+#ifdef _DEBUG	
+	_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
+#endif
+
+	printf("********************************************************************************\r\n");
+	printf("*                    RCKangaroo v3.0  (c) 2024 RetiredCoder                    *\r\n");
+	printf("********************************************************************************\r\n\r\n");
+
+	printf("This software is free and open-source: https://github.com/RetiredC\r\n");
+	printf("It demonstrates fast GPU implementation of SOTA Kangaroo method for solving ECDLP\r\n");
+
+#ifdef _WIN32
+	printf("Windows version\r\n");
+#else
+	printf("Linux version\r\n");
+#endif
+
+#ifdef DEBUG_MODE
+	printf("DEBUG MODE\r\n\r\n");
+#endif
+
+	InitEc();
+	gDP = 0;
+	gRange = 0;
+	gStartSet = false;
+	gTamesFileName[0] = 0;
+	gMax = 0.0;
+	gGenMode = false;
+	gIsOpsLimit = false;
+	memset(gGPUs_Mask, 1, sizeof(gGPUs_Mask));
+	if (!ParseCommandLine(argc, argv))
+		return 0;
+
+	InitGpus();
+
+	if (!GpuCnt)
+	{
+		printf("No supported GPUs detected, exit\r\n");
+		return 0;
+	}
+
+	pPntList = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE);
+	pPntList2 = (u8*)malloc(MAX_CNT_LIST * GPU_DP_SIZE);
+	TotalOps = 0;
+	TotalSolved = 0;
+	gTotalErrors = 0;
+	IsBench = gPubKey.x.IsZero();
+
+	if (!IsBench && !gGenMode)
+	{
+		printf("\r\nMAIN MODE\r\n\r\n");
+		EcPoint PntToSolve, PntOfs;
+		EcInt pk, pk_found;
+
+		PntToSolve = gPubKey;
+		if (!gStart.IsZero())
+		{
+			PntOfs = ec.MultiplyG(gStart);
+			PntOfs.y.NegModP();
+			PntToSolve = ec.AddPoints(PntToSolve, PntOfs);
+		}
+
+		char sx[100], sy[100];
+		gPubKey.x.GetHexStr(sx);
+		gPubKey.y.GetHexStr(sy);
+		printf("Solving public key\r\nX: %s\r\nY: %s\r\n", sx, sy);
+		gStart.GetHexStr(sx);
+		printf("Offset: %s\r\n", sx);
+
+		if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found))
+		{
+			if (!gIsOpsLimit)
+				printf("FATAL ERROR: SolvePoint failed\r\n");
+			goto label_end;
+		}
+		pk_found.AddModP(gStart);
+		EcPoint tmp = ec.MultiplyG(pk_found);
+		if (!tmp.IsEqual(gPubKey))
+		{
+			printf("FATAL ERROR: SolvePoint found incorrect key\r\n");
+			goto label_end;
+		}
+		//happy end
+		char s[100];
+		pk_found.GetHexStr(s);
+		printf("\r\nPRIVATE KEY: %s\r\n\r\n", s);
+		FILE* fp = fopen("RESULTS.TXT", "a");
+		if (fp)
+		{
+			fprintf(fp, "PRIVATE KEY: %s\n", s);
+			fclose(fp);
+		}
+		else //we cannot save the key, show error and wait forever so the key is displayed
+		{
+			printf("WARNING: Cannot save the key to RESULTS.TXT!\r\n");
+			while (1)
+				Sleep(100);
+		}
+	}
+	else
+	{
+		if (gGenMode)
+			printf("\r\nTAMES GENERATION MODE\r\n");
+		else
+			printf("\r\nBENCHMARK MODE\r\n");
+		//solve points, show K
+		while (1)
+		{
+			EcInt pk, pk_found;
+			EcPoint PntToSolve;
+
+			if (!gRange)
+				gRange = 78;
+			if (!gDP)
+				gDP = 16;
+
+			//generate random pk
+			pk.RndBits(gRange);
+			PntToSolve = ec.MultiplyG(pk);
+
+			if (!SolvePoint(PntToSolve, gRange, gDP, &pk_found))
+			{
+				if (!gIsOpsLimit)
+					printf("FATAL ERROR: SolvePoint failed\r\n");
+				break;
+			}
+			if (!pk_found.IsEqual(pk))
+			{
+				printf("FATAL ERROR: Found key is wrong!\r\n");
+				break;
+			}
+			TotalOps += PntTotalOps;
+			TotalSolved++;
+			u64 ops_per_pnt = TotalOps / TotalSolved;
+			double K = (double)ops_per_pnt / pow(2.0, gRange / 2.0);
+			printf("Points solved: %d, average K: %.3f (with DP and GPU overheads)\r\n", TotalSolved, K);
+			//if (TotalSolved >= 100) break; //dbg
+		}
+	}
+label_end:
+	for (int i = 0; i < GpuCnt; i++)
+		delete GpuKangs[i];
+	DeInitEc();
+	free(pPntList2);
+	free(pPntList);
+}
diff --git a/RCKangaroo.sln b/RCKangaroo.sln
deleted file mode 100644
index f5d8c2a..0000000
--- a/RCKangaroo.sln
+++ /dev/null
@@ -1,28 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.12.35514.174 d17.12
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RCKangaroo", "RCKangaroo.vcxproj", "{B7EF30AA-1D02-4EC4-A835-08766EC4A094}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|x64 = Debug|x64
-		Debug|x86 = Debug|x86
-		Release|x64 = Release|x64
-		Release|x86 = Release|x86
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x64.ActiveCfg = Debug|x64
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x64.Build.0 = Debug|x64
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x86.ActiveCfg = Debug|Win32
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Debug|x86.Build.0 = Debug|Win32
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x64.ActiveCfg = Release|x64
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x64.Build.0 = Release|x64
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x86.ActiveCfg = Release|Win32
-		{B7EF30AA-1D02-4EC4-A835-08766EC4A094}.Release|x86.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
diff --git a/RCKangaroo.vcxproj b/RCKangaroo.vcxproj
deleted file mode 100644
index 53916d9..0000000
--- a/RCKangaroo.vcxproj
+++ /dev/null
@@ -1,173 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <VCProjectVersion>17.0</VCProjectVersion>
-    <Keyword>Win32Proj</Keyword>
-    <ProjectGuid>{b7ef30aa-1d02-4ec4-a835-08766ec4a094}</ProjectGuid>
-    <RootNamespace>RCKangaroo</RootNamespace>
-    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v143</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v143</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v143</PlatformToolset>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v143</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-    <UseOfMfc>Static</UseOfMfc>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 12.6.props" />
-  </ImportGroup>
-  <ImportGroup Label="Shared">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <ConformanceMode>true</ConformanceMode>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <ConformanceMode>true</ConformanceMode>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <ConformanceMode>true</ConformanceMode>
-      <MultiProcessorCompilation>true</MultiProcessorCompilation>
-      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
-      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-    <CudaCompile>
-      <CodeGeneration>compute_89,sm_89;compute_86,sm_86;compute_75,sm_75;compute_61,sm_61</CodeGeneration>
-      <MaxRegCount>0</MaxRegCount>
-      <FastMath>false</FastMath>
-    </CudaCompile>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <ConformanceMode>true</ConformanceMode>
-      <MultiProcessorCompilation>true</MultiProcessorCompilation>
-      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-    <CudaCompile>
-      <CodeGeneration>compute_89,sm_89;compute_86,sm_86;compute_75,sm_75;compute_61,sm_61</CodeGeneration>
-    </CudaCompile>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="Ec.cpp">
-      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
-      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Default</BasicRuntimeChecks>
-      <FavorSizeOrSpeed Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Speed</FavorSizeOrSpeed>
-      <DebugInformationFormat Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ProgramDatabase</DebugInformationFormat>
-    </ClCompile>
-    <ClCompile Include="GpuKang.cpp" />
-    <ClCompile Include="RCKangaroo.cpp" />
-    <ClCompile Include="utils.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="defs.h" />
-    <ClInclude Include="Ec.h" />
-    <ClInclude Include="GpuKang.h" />
-    <ClInclude Include="RCGpuUtils.h" />
-    <ClInclude Include="utils.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <CudaCompile Include="RCGpuCore.cu">
-      <GPUDebugInfo Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</GPUDebugInfo>
-      <PtxAsOptionV Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</PtxAsOptionV>
-      <PtxAsOptionV Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</PtxAsOptionV>
-    </CudaCompile>
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 12.6.targets" />
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/RCKangaroo.vcxproj.user b/RCKangaroo.vcxproj.user
deleted file mode 100644
index ac7bdd1..0000000
--- a/RCKangaroo.vcxproj.user
+++ /dev/null
@@ -1,11 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LocalDebuggerCommandArguments>-gpu 0 -tames tames78.dat -max 10</LocalDebuggerCommandArguments>
-    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LocalDebuggerCommandArguments>-gpu 0 -dp 16 -range 84 -start 1000000000000000000000 -pubkey 0329c4574a4fd8c810b7e42a4b398882b381bcd85e40c6883712912d167c83e73a</LocalDebuggerCommandArguments>
-    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
-  </PropertyGroup>
-</Project>
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index b272088..0000000
--- a/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-(c) 2024, RetiredCoder (RC)
-
-RCKangaroo is free and open-source (GPLv3).
-This software demonstrates efficient GPU implementation of SOTA Kangaroo method for solving ECDLP. 
-It's part #3 of my research, you can find more details here: https://github.com/RetiredC
-
-Discussion thread: https://bitcointalk.org/index.php?topic=5517607
-
-<b>Features:</b>
-
-- Lowest K=1.15, it means 1.8 times less required operations compared to classic method with K=2.1, also it means that you need 1.8 times less memory to store DPs.
-- Fast, about 8GKeys/s on RTX 4090, 4GKeys/s on RTX 3090.
-- Keeps DP overhead as small as possible.
-- Supports ranges up to 170 bits.
-- Both Windows and Linux are supported.
-
-<b>Limitations:</b>
-
-- No advanced features like networking, saving/loading DPs, etc.
-
-<b>Command line parameters:</b>
-
-<b>-gpu</b>		which GPUs are used, for example, "035" means that GPUs #0, #3 and #5 are used. If not specified, all available GPUs are used. 
-
-<b>-pubkey</b>		public key to solve, both compressed and uncompressed keys are supported. If not specified, software starts in benchmark mode and solves random keys. 
-
-<b>-start</b>		start offset of the key, in hex. Mandatory if "-pubkey" option is specified. For example, for puzzle #85 start offset is "1000000000000000000000". 
-
-<b>-range</b>		bit range of private the key. Mandatory if "-pubkey" option is specified. For example, for puzzle #85 bit range is "84" (84 bits). Must be in range 32...170. 
-
-<b>-dp</b>		DP bits. Must be in range 14...60. Low DP bits values cause larger DB but reduces DP overhead and vice versa. 
-
-<b>-max</b>		option to limit max number of operations. For example, value 5.5 limits number of operations to 5.5 * 1.15 * sqrt(range), software stops when the limit is reached. 
-
-<b>-tames</b>		filename with tames. If file not found, software generates tames (option "-max" is required) and saves them to the file. If the file is found, software loads tames to speedup solving. 
-
-When public key is solved, software displays it and also writes it to "RESULTS.TXT" file. 
-
-Sample command line for puzzle #85:
-
-RCKangaroo.exe -dp 16 -range 84 -start 1000000000000000000000 -pubkey 0329c4574a4fd8c810b7e42a4b398882b381bcd85e40c6883712912d167c83e73a
-
-Sample command to generate tames:
-
-RCKangaroo.exe -dp 16 -range 76 -tames tames76.dat -max 10
-
-Then you can restart software with same parameters to see less K in benchmark mode or add "-tames tames76.dat" to solve some public key in 76-bit range faster.
-
-<b>Some notes:</b>
-
-Fastest ECDLP solvers will always use SOTA/SOTA+ method, as it's 1.4/1.5 times faster and requires less memory for DPs compared to the best 3-way kangaroos with K=1.6. 
-Even if you already have a faster implementation of kangaroo jumps, incorporating SOTA method will improve it further. 
-While adding the necessary loop-handling code will cause you to lose about 5–15% of your current speed, the SOTA method itself will provide a 40% performance increase. 
-Overall, this translates to roughly a 25% net improvement, which should not be ignored if your goal is to build a truly fast solver. 
-
-
-<b>Changelog:</b>
-
-v3.0:
-
-- added "-tames" and "-max" options.
-- fixed some bugs.
-
-v2.0:
-
-- added support for 30xx, 20xx and 1xxx cards.
-- some minor changes.
-
-v1.1:
-
-- added ability to start software on 30xx cards.
-
-v1.0:
-
-- initial release.
\ No newline at end of file
diff --git a/README_es_v16.md b/README_es_v16.md
new file mode 100644
index 0000000..40aa0fa
--- /dev/null
+++ b/README_es_v16.md
@@ -0,0 +1,228 @@
+# RCKangaroo v1.5 (build local)
+
+Implementación de *Kangaroo ECDLP* acelerada por GPU con varias mejoras
+orientadas a **tiempo de cómputo** y **uso de memoria / I/O**.
+
+> Probado con CUDA 12.x y NVIDIA RTX 3060 (SM 8.6).  
+> Esta rama mantiene la **CLI original** y añade banderas/guías para *tame tuning* y *benchmarks*
+
+
+## Novedades técnicas (V1.5)
+
+1) **Coordenadas Jacobianas en GPU** (opt-in)
+   - Suma/doblado en Jacobiano para evitar inversiones modulares por paso.
+   - Conversión a afin **solo cuando hace falta** (p.ej. para DPs o salida).
+   - Add mixta (*Jacobian + Affine precomp*) para puntos de salto.
+   - Conmutador de compilación: `USE_JACOBIAN=1` (habilitado en build por defecto de esta rama).
+
+2) **Inversión por lotes (Truco de Montgomery)**
+   - Se invierten muchos `Z` con **una sola inversión** y productos hacia delante/atrás.
+   - Útil en compactación/normalización de estados y/o verificaciones masivas.
+
+3) **TAMES v1.5 – formato compacto**
+   - **~30–35% menos tamaño** vs. formato clásico en nuestros tests (p.ej. 84 MB → 57 MB).
+   - Layout contiguo + compresión ligera (delta + varint/RLE) y lectura *streaming*.
+   - Carga más rápida y menos *pressure* de caché/L2/PCIe.
+   - **Compatibilidad**: el binario sigue aceptando el formato anterior; si el archivo no es v1.5, se lee por la ruta clásica.
+
+4) **Menos I/O y binario optimizado**
+   - Flags `-ffunction-sections -fdata-sections` (host) y `-Xfatbin=-compress-all` (device).
+   - Caché L1/tex en *ptxas* via `-Xptxas -dlcm=ca` en `build.sh`.
+
+> Nota: la *Montgomery Ladder* también está disponible en el código, pero no se fuerza por CLI; se usó Jacobiano + ventanas clásicas/mixtas, que mostraron mejor relación velocidad/uso de recursos en Ampere.
+
+---
+
+---
+
+## 🚀 Novedades en v1.6
+
+### Mejoras en GPU
+- **Atómicas warp-aggregadas en emisión de DPs**: reduce de 32 atómicas por warp a 1, con escrituras coalescentes. **+10–30% rendimiento** según GPU y -dp.
+- **Mejor coalescencia de memoria** en DPs y transferencias PCIe.
+
+### Nuevo formato `.dat` (v1.6)
+- **28B por registro DP** (vs 32B en v1.5).
+  - Cola de X: 5 bytes (antes 9).
+  - Distancia: 22 bytes.
+  - Tipo: 1 byte.
+- **Etiqueta de archivo `TMBM16`** identifica el nuevo formato.
+- **Compatibilidad hacia atrás**: lectura de v1.5 y v1.6.
+
+### Benchmarks (RTX 3060)
+- v1.5: ~750 MKey/s @ -dp 16.
+- v1.6: ~870 MKey/s @ -dp 16.
+- ~16% más rápido y ~12.5% menos tamaño en `.dat`.
+
+
+
+## Archivos modificados / añadidos
+
+- **`RCGpuCore.cu`**  
+  Implementaciones Jacobianas (doble/suma mixta), camino de *batch inversion* y selección de kernels según `USE_JACOBIAN`.
+
+- **`RCGpuUtils.h`**  
+  Primitivas de campo y helpers para Jacobiano (doble / add mixed).
+
+- **`utils.h`, `utils.cpp`**  
+  - Nueva ruta de **lectura/escritura TAMES v1.5** (streaming, compacta).  
+  - Limpieza de utilidades y helpers varios.
+
+- **`GpuKang.cpp`, `GpuKang.h`**  
+  - Parámetros de *tame tuning* (ratio y bits) expuestos para benchs controlados.
+  - Generación de distancias y partición *tame/wild* estable.
+
+- **`RCKangaroo.cpp`**  
+  - Parsing de CLI y *guard-rails* (mensajes de error consistentes).  
+  - Modo *bench* más verboso.
+
+- **`Makefile`**  
+  - Objetivo directo para `rckangaroo` (sin librerías intermedias).  
+  - Soporte `SM`, `USE_JACOBIAN`, `PROFILE` y *linking* determinista.
+
+- **Scripts de apoyo**  
+  - `build.sh` – *wrapper* de compilación multi-SM.
+  - `bench_grid.sh` – *sweep* de parámetros (dp / tame-bits / tame-ratio) con repetición y logs.
+  - `bench_rck.sh` – *benchmark* de A/B rápido.
+  - `summarize_bench.py` – parser de logs → CSV (speed, tiempo real, RSS, parámetros).
+
+---
+
+## Árbol del proyecto (esta rama)
+
+```
+.
+├── logs/                          # salida de bench_grid.sh
+├── bench_grid.sh
+├── bench_rck.sh
+├── build.sh
+├── Makefile
+├── defs.h
+├── Ec.cpp
+├── Ec.h
+├── GpuKang.cpp
+├── GpuKang.h
+├── RCGpuCore.cu
+├── RCGpuUtils.h
+├── RCKangaroo.cpp
+├── rckangaroo                   # binario (tras build)
+├── summarize_bench.py
+├── tames71.dat                  # ejemplo formato clásico
+├── tames71_v15.dat              # ejemplo formato v1.5 (compacto)
+├── utils.cpp
+└── utils.h
+```
+
+---
+
+## Compilación
+
+### Opción A – `build.sh` (recomendada)
+```bash
+# Sintaxis: ./build.sh <SM> <USE_JACOBIAN 0|1> <profile: release|debug>
+./build.sh 86 1 release     # RTX 3060 (SM 8.6), Jacobiano ON
+./build.sh 86 0 release     # Jacobiano OFF (afin) para A/B
+```
+Genera `./rckangaroo` en el directorio actual.
+
+### Opción B – `make`
+```bash
+# Variables: SM, USE_JACOBIAN, PROFILE=(release|debug)
+make SM=86 USE_JACOBIAN=1 PROFILE=release -j
+```
+
+> Requisitos: CUDA 12+, `g++` C++17, driver suficiente para la SM objetivo.
+
+
+---
+
+## Modo de uso (CLI)
+
+Ejemplo mínimo (con TAMES v1.5):
+```bash
+./rckangaroo \
+  -pubkey 0290e6900a58d33393bc1097b5aed31f2e4e7cbd3e5466af958665bc0121248483 \
+  -range 71 \
+  -dp 16 \
+  -start 0 \
+  -tames tames71_v15.dat
+```
+
+Parámetros útiles de *tame tuning* (se pasan por CLI y se reflejan en logs de bench):
+```
+  -tame-bits <N>      # bits usados para los saltos "tame" (p.ej. 4–7)
+  -tame-ratio <PCT>   # porcentaje de canguros tame (p.ej. 25–50)
+```
+Ejemplo:
+```bash
+./rckangaroo ... -tame-bits 4 -tame-ratio 33
+```
+
+> Sugerencia: buscar combinaciones que **maximicen MKeys/s** pero con **menor tiempo real** y **memoria** aceptable.
+
+
+---
+
+## Benchmarks automatizados
+
+### Barrido de parámetros (grilla)
+```bash
+# Editar cabezal del archivo para ajustar PUBKEY/RANGE/DP/TAMES/etc.
+chmod +x bench_grid.sh summarize_bench.py
+
+# Ejecutar barrido (graba todo en logs/)
+./bench_grid.sh
+
+# Resumir a CSV y visualizar
+python3 summarize_bench.py logs > summary.csv
+column -s, -t < summary.csv | less -S
+```
+Comparativa Jacobiano OFF/ON:
+```bash
+# Jacobiano ON
+./build.sh 86 1 release && MODE_TAG="j1" ./bench_grid.sh
+python3 summarize_bench.py logs > summary_j1.csv
+
+# Jacobiano OFF
+./build.sh 86 0 release && MODE_TAG="j0" ./bench_grid.sh
+python3 summarize_bench.py logs > summary_j0.csv
+```
+
+> **TIP**: Dejá `REPEATS>=5` para mitigar jitter; el parser reporta **medianas** por combinación.
+
+
+---
+
+## Resultados de referencia (orientativos)
+
+En pruebas rápidas de 71 bits en RTX 3060:
+- **TAMES v1.5**: 84 MB → **57 MB** (~32% menor).  
+- **Tiempo real**: ~100 s → **~65 s** (Jacobiano + v1.5 + mismos parámetros).  
+- **RSS**: ligera reducción (≈ -20–30 MB según corrida).
+
+> Los números varían por DP, *tame-bits*, *tame-ratio*, reloj de la GPU y versión de driver.
+
+
+---
+
+## Compatibilidad y notas
+
+- El binario mantiene la lectura del formato de TAMES **clásico** y del **v1.5** (detectados por cabecera / heurística).  
+- Si necesitás convertir masivamente a v1.5, se recomienda regenerar con el *pipeline* que usás para crear los tames, apuntando al escritor v1.5 (ver `utils.cpp`).
+
+
+---
+
+## Solución de problemas
+
+- **`Unknown option -ffunction-sections` en NVCC**: usá `build.sh` (pasa por `-Xcompiler`).  
+- **`No rule to make target 'RCGpuCore.o'`**: asegurate de usar este repositorio / Makefile o `./build.sh`.
+- **`CUDA error / cap mismatch`**: compila con `./build.sh <tu SM> ...` (p.ej. 75 para Turing, 86 para Ampere).
+
+
+---
+
+## Licencia
+
+Mantiene la licencia del proyecto original (ver `LICENSE.TXT` si aplica).  
+Autorizado a ser usado con fines de investigación y educativos.
diff --git a/README_v16.md b/README_v16.md
new file mode 100644
index 0000000..faf8c43
--- /dev/null
+++ b/README_v16.md
@@ -0,0 +1,221 @@
+# RCKangaroo v1.5 (local build)
+
+GPU-accelerated **Kangaroo ECDLP** implementation with improvements focused on
+**compute time** and **memory / I/O** efficiency.
+
+> Tested with CUDA 12.x and NVIDIA RTX 3060 (SM 8.6).  
+> This branch keeps the **original CLI** and adds flags/guides for *tame tuning* and *benchmarks*.
+
+---
+
+---
+
+## 🚀 What’s New in v1.6
+
+### GPU Improvements
+- **Warp-aggregated atomics for DP emission**: reduced per-thread atomics to a single warp-level atomic, coalesced writes. **+10–30% throughput** depending on GPU and -dp.
+- **Better memory coalescing** for DPs and PCIe transfers.
+
+### New `.dat` Format (v1.6)
+- **28B per DP record** (vs 32B in v1.5).
+  - X tail: 5 bytes (was 9).
+  - Distance: 22 bytes.
+  - Type: 1 byte.
+- **File tag `TMBM16`** identifies new format.
+- **Backward compatible**: reads both v1.5 and v1.6.
+
+### Benchmarks (RTX 3060)
+- v1.5: ~750 MKey/s @ -dp 16.
+- v1.6: ~870 MKey/s @ -dp 16.
+- ~16% faster and ~12.5% smaller `.dat` files.
+
+
+
+## Technical Highlights (V1.5)
+
+1) **Jacobian Coordinates on GPU** (opt‑in)
+   - Point add/double in Jacobian to avoid modular inversions per step.
+   - Convert back to affine **only when needed** (e.g., DPs or output).
+   - Mixed add (*Jacobian + Affine precomp*) for jump points.
+   - Build switch: `USE_JACOBIAN=1` (enabled by default in this branch).
+
+2) **Batch Inversion (Montgomery trick)**
+   - Invert many `Z` values with **a single** field inversion using forward/backward products.
+   - Useful for compacting/normalizing states and bulk verifications.
+
+3) **TAMES v1.5 – compact file format**
+   - **~30–35% smaller** than the classic layout in our tests (e.g., 84 MB → 57 MB).
+   - Contiguous layout + light compression (delta + varint/RLE) with streaming reads.
+   - Faster load and lower cache/L2/PCIe pressure.
+   - **Compatibility**: the binary still accepts the classic format; if the file is not v1.5, it uses the legacy path.
+
+4) **Less I/O and optimized binary size**
+   - Host flags `-ffunction-sections -fdata-sections`; device fatbin compression `-Xfatbin=-compress-all`.
+   - L1/tex cache hint via `-Xptxas -dlcm=ca` in `build.sh`.
+
+> Note: *Montgomery Ladder* is available in code but not enforced via CLI; Jacobian + classic/mixed windows showed a better perf/resource balance on Ampere.
+
+---
+
+## Modified / Added Files
+
+- **`RCGpuCore.cu`**  
+  Jacobian implementations (double/mixed-add), batch inversion path, and kernel selection via `USE_JACOBIAN`.
+
+- **`RCGpuUtils.h`**  
+  Field primitives and helpers for Jacobian (double / mixed add).
+
+- **`utils.h`, `utils.cpp`**  
+  - New **TAMES v1.5 reader/writer** (streaming, compact).  
+  - Utility cleanups.
+
+- **`GpuKang.cpp`, `GpuKang.h`**  
+  - Exposed *tame tuning* parameters (ratio and bits) for controlled benches.
+  - Distance generation and stable tame/wild partitioning.
+
+- **`RCKangaroo.cpp`**  
+  - CLI parsing + guard rails (consistent error messages).  
+  - More verbose bench output.
+
+- **`Makefile`**  
+  - Direct `rckangaroo` target (no intermediate archives).  
+  - Support for `SM`, `USE_JACOBIAN`, `PROFILE`, and deterministic linking.
+
+- **Helper scripts**  
+  - `build.sh` – multi‑SM build wrapper.
+  - `bench_grid.sh` – parameter sweep (dp / tame-bits / tame-ratio) with repeats and logs.
+  - `bench_rck.sh` – quick A/B benchmark.
+  - `summarize_bench.py` – log parser → CSV (speed, wall time, RSS, parameters).
+
+---
+
+## Project Tree (this branch)
+
+```
+.
+├── logs/                          # output from bench_grid.sh
+├── bench_grid.sh
+├── bench_rck.sh
+├── build.sh
+├── Makefile
+├── defs.h
+├── Ec.cpp
+├── Ec.h
+├── GpuKang.cpp
+├── GpuKang.h
+├── RCGpuCore.cu
+├── RCGpuUtils.h
+├── RCKangaroo.cpp
+├── rckangaroo                   # binary after build
+├── summarize_bench.py
+├── tames71.dat                  # classic format example
+├── tames71_v15.dat              # v1.5 compact example
+├── utils.cpp
+└── utils.h
+```
+
+---
+
+## Build
+
+### Option A – `build.sh` (recommended)
+```bash
+# Syntax: ./build.sh <SM> <USE_JACOBIAN 0|1> <profile: release|debug>
+./build.sh 86 1 release     # RTX 3060 (SM 8.6), Jacobian ON
+./build.sh 86 0 release     # Jacobian OFF (affine) for A/B
+```
+Produces `./rckangaroo` in the current directory.
+
+### Option B – `make`
+```bash
+# Variables: SM, USE_JACOBIAN, PROFILE=(release|debug)
+make SM=86 USE_JACOBIAN=1 PROFILE=release -j
+```
+
+> Requirements: CUDA 12+, `g++` with C++17, and a driver supporting your target SM.
+
+
+---
+
+## Usage (CLI)
+
+Minimal example (with TAMES v1.5):
+```bash
+./rckangaroo \
+  -pubkey 0290e6900a58d33393bc1097b5aed31f2e4e7cbd3e5466af958665bc0121248483 \
+  -range 71 \
+  -dp 16 \
+  -start 0 \
+  -tames tames71_v15.dat
+```
+
+Tame tuning parameters (reflected in bench logs):
+```
+  -tame-bits <N>      # bits used for tame jumps (e.g., 4–7)
+  -tame-ratio <PCT>   # percent of tame kangaroos (e.g., 25–50)
+```
+Example:
+```bash
+./rckangaroo ... -tame-bits 4 -tame-ratio 33
+```
+
+> Goal: find combinations that **maximize MKeys/s** while minimizing **wall time** and keeping **memory** acceptable.
+
+
+---
+
+## Automated Benchmarks
+
+### Parameter sweep (grid)
+```bash
+# Edit the header of the script to set PUBKEY/RANGE/DP/TAMES/etc.
+chmod +x bench_grid.sh summarize_bench.py
+
+# Run grid (stores everything under logs/)
+./bench_grid.sh
+
+# Summarize to CSV
+python3 summarize_bench.py logs > summary.csv
+column -s, -t < summary.csv | less -S
+```
+Jacobian OFF/ON comparison:
+```bash
+# Jacobian ON
+./build.sh 86 1 release && MODE_TAG="j1" ./bench_grid.sh
+python3 summarize_bench.py logs > summary_j1.csv
+
+# Jacobian OFF
+./build.sh 86 0 release && MODE_TAG="j0" ./bench_grid.sh
+python3 summarize_bench.py logs > summary_j0.csv
+```
+
+> **TIP**: Use `REPEATS>=5` to reduce jitter; the parser reports **medians** per combination.
+
+
+---
+
+## Reference Results (indicative)
+
+Quick 71‑bit tests on an RTX 3060:
+- **TAMES v1.5**: 84 MB → **57 MB** (~32% smaller).  
+- **Wall time**: ~100 s → **~65 s** (Jacobian + v1.5 with same parameters).  
+- **RSS**: slight reduction (≈ −20–30 MB depending on run).
+
+> Numbers vary with DP, *tame-bits*, *tame-ratio*, GPU clocks, and driver version.
+
+
+---
+
+## Troubleshooting
+
+- **`Unknown option -ffunction-sections` from NVCC**: use `build.sh` (passes via `-Xcompiler`).  
+- **`No rule to make target 'RCGpuCore.o'`**: use this repo/Makefile or `./build.sh`.  
+- **`CUDA error / cap mismatch`**: compile via `./build.sh <your SM> ...` (e.g., 75 for Turing, 86 for Ampere).
+
+
+---
+
+## License
+
+Inherits the original project’s license (see `LICENSE.TXT` if present).  
+Permitted for research and educational use.
diff --git a/RESULTS.TXT b/RESULTS.TXT
new file mode 100644
index 0000000..2ede806
--- /dev/null
+++ b/RESULTS.TXT
@@ -0,0 +1,4 @@
+PRIVATE KEY: 0000000000000000000000000000000000000000000000349B84B6431A6C4EF1
+PRIVATE KEY: 0000000000000000000000000000000000000000000000349B84B6431A6C4EF1
+PRIVATE KEY: 0000000000000000000000000000000000000000000000101D83275FB2BC7E0C
+PRIVATE KEY: 00000000000000000000000000000000000000000000000BEBB3940CD0FC1491
diff --git a/bench_grid.sh b/bench_grid.sh
new file mode 100755
index 0000000..8b03c17
--- /dev/null
+++ b/bench_grid.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ========= Config por defecto (sobre-escribibles por variables de entorno) =========
+PUBKEY="${PUBKEY:-0290e6900a58d33393bc1097b5aed31f2e4e7cbd3e5466af958665bc0121248483}"
+RANGE="${RANGE:-71}"
+START="${START:-0}"
+TAMES_FILE="${TAMES_FILE:-tames71_v15.dat}"   # Cambiá si querés otro archivo
+REPEATS="${REPEATS:-5}"
+DP_LIST="${DP_LIST:-14 15 16}"
+TAME_BITS_LIST="${TAME_BITS_LIST:-4 5}"
+TAME_RATIO_LIST="${TAME_RATIO_LIST:-25 33 40}"
+MODE_TAG="${MODE_TAG:-j1}"   # usa j1/j0 para distinguir Jacobiano ON/OFF
+LOGDIR="${LOGDIR:-logs}"
+
+mkdir -p "$LOGDIR"
+
+echo "== Bench grid =="
+echo "PUBKEY=$PUBKEY"
+echo "RANGE=$RANGE  START=$START"
+echo "TAMES_FILE=$TAMES_FILE"
+echo "REPEATS=$REPEATS"
+echo "DP_LIST=$DP_LIST"
+echo "TAME_BITS_LIST=$TAME_BITS_LIST"
+echo "TAME_RATIO_LIST=$TAME_RATIO_LIST"
+echo "MODE_TAG=$MODE_TAG"
+echo "LOGDIR=$LOGDIR"
+echo
+
+# Chequeos
+if [[ ! -x "./rckangaroo" ]]; then
+  echo "ERROR: ./rckangaroo no existe o no es ejecutable. Compilá primero." >&2
+  exit 1
+fi
+if [[ ! -f "$TAMES_FILE" ]]; then
+  echo "ERROR: no se encontró $TAMES_FILE" >&2
+  exit 1
+fi
+
+run_one() {
+  local dp="$1"
+  local tb="$2"
+  local tr="$3"
+  local r="$4"
+  local of="$LOGDIR/${MODE_TAG}_dp${dp}_tb${tb}_tr${tr}_run${r}.log"
+
+  echo ">> dp=$dp  tame-bits=$tb  tame-ratio=$tr  run=$r"
+  /usr/bin/time -f "%E real  %Mk RSS  %I in KB  %O out KB" \
+    ./rckangaroo -pubkey "$PUBKEY" -range "$RANGE" -dp "$dp" -start "$START" \
+                 -tames "$TAMES_FILE" -tame-bits "$tb" -tame-ratio "$tr" \
+      |& tee "$of"
+}
+
+for dp in $DP_LIST; do
+  for tb in $TAME_BITS_LIST; do
+    for tr in $TAME_RATIO_LIST; do
+      for r in $(seq 1 "$REPEATS"); do
+        run_one "$dp" "$tb" "$tr" "$r"
+      done
+    done
+  done
+done
+
+echo
+echo "== Listo. Logs guardados en $LOGDIR"
+echo "Sugerencia: python3 summarize_bench.py $LOGDIR > summary_${MODE_TAG}.csv"
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..10ac8bf
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SM="${1:-86}"
+USE_JACOBIAN="${2:-1}"
+PROFILE="${3:-release}"
+CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-12.0}"
+
+HOST_COPT_release="-O3 -DNDEBUG -ffunction-sections -fdata-sections"
+HOST_COPT_debug="-O0 -g"
+DEV_COPT_release="-O3"
+DEV_COPT_debug="-O0 -g"
+
+if [[ "$PROFILE" == "release" ]]; then 
+  HOST_COPT="$HOST_COPT_release"
+  DEV_COPT="$DEV_COPT_release"
+else 
+  HOST_COPT="$HOST_COPT_debug"
+  DEV_COPT="$DEV_COPT_debug"
+fi
+
+CCFLAGS="-std=c++17 -I${CUDA_PATH}/include ${HOST_COPT} -DUSE_JACOBIAN=${USE_JACOBIAN}"
+NVCCFLAGS="-std=c++17 -arch=sm_${SM} ${DEV_COPT} -Xptxas -O3 -Xptxas -dlcm=ca -Xfatbin=-compress-all -DUSE_JACOBIAN=${USE_JACOBIAN}"
+NVCCXCOMP="-Xcompiler -ffunction-sections -Xcompiler -fdata-sections"
+LDFLAGS="-L${CUDA_PATH}/lib64 -lcudart -pthread"
+
+echo "== CCFLAGS:   ${CCFLAGS}"
+echo "== NVCCFLAGS: ${NVCCFLAGS} ${NVCCXCOMP}"
+
+# Compile C++
+g++ ${CCFLAGS} -c RCKangaroo.cpp -o RCKangaroo.o
+g++ ${CCFLAGS} -c GpuKang.cpp    -o GpuKang.o
+g++ ${CCFLAGS} -c Ec.cpp         -o Ec.o
+g++ ${CCFLAGS} -c utils.cpp      -o utils.o
+
+# Compile CUDA (if present)
+if [[ -f "RCGpuCore.cu" ]]; then
+  /usr/bin/nvcc ${NVCCFLAGS} ${NVCCXCOMP} -c RCGpuCore.cu -o RCGpuCore.o
+  g++ ${CCFLAGS} -o rckangaroo RCKangaroo.o GpuKang.o Ec.o utils.o RCGpuCore.o ${LDFLAGS}
+else
+  echo "WARN: RCGpuCore.cu no existe; enlazando CPU-only"
+  g++ ${CCFLAGS} -o rckangaroo RCKangaroo.o GpuKang.o Ec.o utils.o ${LDFLAGS}
+fi
+echo "== Listo: ./rckangaroo"
diff --git a/defs.h b/defs.h
index c84e192..724a527 100644
--- a/defs.h
+++ b/defs.h
@@ -1,97 +1,115 @@
-// This file is a part of RCKangaroo software
-// (c) 2024, RetiredCoder (RC)
-// License: GPLv3, see "LICENSE.TXT" file
-// https://github.com/RetiredC
-
-
-#pragma once 
-
-#pragma warning(disable : 4996)
-
-typedef unsigned long long u64;
-typedef long long i64;
-typedef unsigned int u32;
-typedef int i32;
-typedef unsigned short u16;
-typedef short i16;
-typedef unsigned char u8;
-typedef char i8;
-
-
-
-#define MAX_GPU_CNT			32
-
-//must be divisible by MD_LEN
-#define STEP_CNT			1000
-
-#define JMP_CNT				512
-
-//use different options for cards older than RTX 40xx
-#ifdef __CUDA_ARCH__
-	#if __CUDA_ARCH__ < 890
-		#define OLD_GPU
-	#endif
-	#ifdef OLD_GPU
-		#define BLOCK_SIZE			512
-		//can be 8, 16, 24, 32, 40, 48, 56, 64
-		#define PNT_GROUP_CNT		64	
-	#else
-		#define BLOCK_SIZE			256
-		//can be 8, 16, 24, 32
-		#define PNT_GROUP_CNT		24
-	#endif
-#else //CPU, fake values
-	#define BLOCK_SIZE			512
-	#define PNT_GROUP_CNT		64
-#endif
-
-// kang type
-#define TAME				0  // Tame kangs
-#define WILD1				1  // Wild kangs1 
-#define WILD2				2  // Wild kangs2
-
-#define GPU_DP_SIZE			48
-#define MAX_DP_CNT			(256 * 1024)
-
-#define JMP_MASK			(JMP_CNT-1)
-
-#define DPTABLE_MAX_CNT		16
-
-#define MAX_CNT_LIST		(512 * 1024)
-
-#define DP_FLAG				0x8000
-#define INV_FLAG			0x4000
-#define JMP2_FLAG			0x2000
-
-#define MD_LEN				10
-
-//#define DEBUG_MODE
-
-//gpu kernel parameters
-struct TKparams
-{
-	u64* Kangs;
-	u32 KangCnt;
-	u32 BlockCnt;
-	u32 BlockSize;
-	u32 GroupCnt;
-	u64* L2;
-	u64 DP;
-	u32* DPs_out;
-	u64* Jumps1; //x(32b), y(32b), d(32b)
-	u64* Jumps2; //x(32b), y(32b), d(32b)
-	u64* Jumps3; //x(32b), y(32b), d(32b)
-	u64* JumpsList; //list of all performed jumps, grouped by warp(32) every 8 groups (from PNT_GROUP_CNT). Each jump is 2 bytes: 10bit jump index + flags: INV_FLAG, DP_FLAG, JMP2_FLAG
-	u32* DPTable;
-	u32* L1S2;
-	u64* LastPnts;
-	u64* LoopTable;
-	u32* dbg_buf;
-	u32* LoopedKangs;
-	bool IsGenMode; //tames generation mode
-
-	u32 KernelA_LDS_Size;
-	u32 KernelB_LDS_Size;
-	u32 KernelC_LDS_Size;	
-};
-
+// This file is a part of RCKangaroo software
+// (c) 2024, RetiredCoder (RC)
+// License: GPLv3, see "LICENSE.TXT" file
+// https://github.com/RetiredC
+
+
+#pragma once 
+
+#pragma warning(disable : 4996)
+
+typedef unsigned long long u64;
+typedef long long i64;
+typedef unsigned int u32;
+typedef int i32;
+typedef unsigned short u16;
+typedef short i16;
+// === Feature flags (performance/algorithm toggles) ============================
+#ifndef USE_JACOBIAN
+// 0 = affine + inversión por lotes (actual predeterminado, muy rápido en GPU)
+// 1 = coordenadas jacobianas + conversión por lotes a afín sólo para DP
+#define USE_JACOBIAN 0
+#endif
+
+#ifndef SCALARMUL_W
+// Ventana para w-NAF (CPU). 4 es un buen equilibrio entre memoria y velocidad.
+#define SCALARMUL_W 4
+#endif
+
+#ifndef USE_MONTGOMERY_LADDER
+// 1 para habilitar la multiplicación escalar por Montgomery Ladder (CPU)
+#define USE_MONTGOMERY_LADDER 1
+#endif
+// ============================================================================
+
+typedef unsigned char u8;
+typedef char i8;
+
+
+
+#define MAX_GPU_CNT			32
+
+//must be divisible by MD_LEN
+#define STEP_CNT			1000
+
+#define JMP_CNT				512
+
+//use different options for cards older than RTX 40xx
+#ifdef __CUDA_ARCH__
+	#if __CUDA_ARCH__ < 890
+		#define OLD_GPU
+	#endif
+	#ifdef OLD_GPU
+		#define BLOCK_SIZE			512
+		//can be 8, 16, 24, 32, 40, 48, 56, 64
+		#define PNT_GROUP_CNT		64	
+	#else
+		#define BLOCK_SIZE			256
+		//can be 8, 16, 24, 32
+		#define PNT_GROUP_CNT		24
+	#endif
+#else //CPU, fake values
+	#define BLOCK_SIZE			512
+	#define PNT_GROUP_CNT		64
+#endif
+
+// kang type
+#define TAME				0  // Tame kangs
+#define WILD1				1  // Wild kangs1 
+#define WILD2				2  // Wild kangs2
+
+#define GPU_DP_SIZE			48
+#define MAX_DP_CNT			(256 * 1024)
+
+#define JMP_MASK			(JMP_CNT-1)
+
+#define DPTABLE_MAX_CNT		16
+
+#define MAX_CNT_LIST		(512 * 1024)
+
+#define DP_FLAG				0x8000
+#define INV_FLAG			0x4000
+#define JMP2_FLAG			0x2000
+
+#define MD_LEN				10
+
+//#define DEBUG_MODE
+
+//gpu kernel parameters
+struct TKparams
+{
+	u64* Kangs;
+	u32 KangCnt;
+	u32 BlockCnt;
+	u32 BlockSize;
+	u32 GroupCnt;
+	u64* L2;
+	u64 DP;
+	u32* DPs_out;
+	u64* Jumps1; //x(32b), y(32b), d(32b)
+	u64* Jumps2; //x(32b), y(32b), d(32b)
+	u64* Jumps3; //x(32b), y(32b), d(32b)
+	u64* JumpsList; //list of all performed jumps, grouped by warp(32) every 8 groups (from PNT_GROUP_CNT). Each jump is 2 bytes: 10bit jump index + flags: INV_FLAG, DP_FLAG, JMP2_FLAG
+	u32* DPTable;
+	u32* L1S2;
+	u64* LastPnts;
+	u64* LoopTable;
+	u32* dbg_buf;
+	u32* LoopedKangs;
+	bool IsGenMode; //tames generation mode
+
+	u32 KernelA_LDS_Size;
+	u32 KernelB_LDS_Size;
+	u32 KernelC_LDS_Size;	
+};
+
diff --git a/gitignore b/gitignore
new file mode 100644
index 0000000..03555f4
--- /dev/null
+++ b/gitignore
@@ -0,0 +1,43 @@
+# RCKangaroo .gitignore
+
+# Build artifacts
+*.o
+*.obj
+*.exe
+*.out
+*.dll
+*.so
+*.dylib
+
+# CUDA intermediate files
+*.cubin
+*.fatbin
+*.ptx
+
+# Generated binaries
+RCKangaroo
+RCKangaroo.exe
+rckangaroo
+rckangaroo.exe
+
+# Logs and debug
+*.log
+*.tmp
+*.bak
+
+# Data files (distinguished points, etc.)
+*.dat
+*.bin
+
+# Editor/OS files
+.DS_Store
+Thumbs.db
+*.swp
+*.swo
+
+# IDE/project configs
+.vscode/
+.idea/
+*.user
+*.sln
+*.vcxproj*
diff --git a/rckangaroo b/rckangaroo
new file mode 100755
index 0000000..1a7f5dd
Binary files /dev/null and b/rckangaroo differ
diff --git a/summarize_bench.py b/summarize_bench.py
new file mode 100755
index 0000000..677859f
--- /dev/null
+++ b/summarize_bench.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+import sys, re, statistics, os, glob
+
+# Extrae el máximo 'Speed: XXX MKeys/s' en cada log (robusto a warmups)
+speed_re = re.compile(r"Speed:\s*([0-9]+(?:\.[0-9]+)?)\s*MKeys/s", re.IGNORECASE)
+
+# '/usr/bin/time -f "%E real  %Mk RSS  %I in KB  %O out KB"'
+time_re  = re.compile(r"(?P<real>(\d+:)?\d+\.\d+)\s+real")
+rss_re   = re.compile(r"\s(?P<rss>\d+)k\s+RSS", re.IGNORECASE)
+
+def parse_real_to_seconds(s):
+    # formatos "0:28.40" ó "12.34"
+    if ":" in s:
+        m, sec = s.split(":")
+        return int(m) * 60 + float(sec)
+    return float(s)
+
+def parse_log(path):
+    max_speed = None
+    real_s = None
+    rss_kb = None
+
+    with open(path, "r", errors="ignore") as f:
+        for line in f:
+            m = speed_re.search(line)
+            if m:
+                v = float(m.group(1))
+                if (max_speed is None) or (v > max_speed):
+                    max_speed = v
+
+        f.seek(0)
+        data = f.read()
+
+        tm = time_re.search(data)
+        if tm:
+            real_s = parse_real_to_seconds(tm.group("real"))
+        rm = rss_re.search(data)
+        if rm:
+            rss_kb = int(rm.group("rss"))
+
+    return max_speed, real_s, rss_kb
+
+def pick(val_list, func):
+    vals = [v for v in val_list if v is not None]
+    if not vals: return ""
+    return func(vals)
+
+def main():
+    if len(sys.argv) < 2:
+        print("Uso: summarize_bench.py <logdir> [pattern]", file=sys.stderr)
+        sys.exit(1)
+    logdir = sys.argv[1]
+    pattern = sys.argv[2] if len(sys.argv) >= 3 else "*.log"
+
+    files = glob.glob(os.path.join(logdir, pattern))
+    if not files:
+        print("No hay logs en", logdir, file=sys.stderr)
+        sys.exit(2)
+
+    # nombres: <tag>_dp<dp>_tb<tb>_tr<tr>_runX.log
+    import re
+    name_re = re.compile(r".*?_dp(?P<dp>\d+)_tb(?P<tb>\d+)_tr(?P<tr>\d+)_run(?P<run>\d+)\.log$")
+
+    groups = {}
+    for p in sorted(files):
+        m = name_re.match(os.path.basename(p))
+        if not m:
+            continue
+        dp = int(m.group("dp")); tb = int(m.group("tb")); tr = int(m.group("tr"))
+        metrics = parse_log(p)
+        groups.setdefault((dp,tb,tr), []).append(metrics)
+
+    print("dp,tame_bits,tame_ratio,runs,median_speed_MKeys,max_speed_MKeys,median_real_s,median_rss_kb")
+
+    for (dp,tb,tr), lst in sorted(groups.items()):
+        speeds  = [x[0] for x in lst if x[0] is not None]
+        reals   = [x[1] for x in lst if x[1] is not None]
+        rsslist = [x[2] for x in lst if x[2] is not None]
+
+        med_speed = pick(speeds, statistics.median)
+        max_speed = pick(speeds, max)
+        med_real  = pick(reals, statistics.median)
+        med_rss   = pick(rsslist, statistics.median)
+
+        def fmt(x):
+            if x == "": return ""
+            if isinstance(x, float): 
+                return f"{x:.3f}"
+            return str(x)
+
+        print(f"{dp},{tb},{tr},{len(lst)},{fmt(med_speed)},{fmt(max_speed)},{fmt(med_real)},{fmt(med_rss)}")
+
+if __name__ == '__main__':
+    main()
diff --git a/summary_j1.csv b/summary_j1.csv
new file mode 100644
index 0000000..d1d0104
--- /dev/null
+++ b/summary_j1.csv
@@ -0,0 +1,19 @@
+dp,tame_bits,tame_ratio,runs,median_speed_MKeys,max_speed_MKeys,median_real_s,median_rss_kb
+14,4,25,5,755.000,784.000,66.040,745940
+14,4,33,5,740.000,744.000,100.080,821616
+14,4,40,5,740.000,753.000,85.440,787120
+14,5,25,5,753.000,755.000,93.070,806608
+14,5,33,5,759.000,761.000,60.130,728480
+14,5,40,5,760.000,762.000,91.140,805548
+15,4,25,5,763.000,765.000,88.580,687048
+15,4,33,5,755.000,759.000,56.670,645252
+15,4,40,5,760.000,764.000,66.400,657748
+15,5,25,5,749.000,765.000,54.610,641992
+15,5,33,5,736.000,740.000,121.550,725884
+15,5,40,5,736.000,738.000,52.380,638124
+16,4,25,5,735.000,736.000,98.060,633456
+16,4,33,5,731.000,733.000,82.430,623392
+16,4,40,5,732.000,733.000,60.760,609024
+16,5,25,5,736.000,751.000,80.850,622144
+16,5,33,5,762.000,763.000,81.740,624944
+16,5,40,5,769.000,778.000,109.390,644316
diff --git a/utils.cpp b/utils.cpp
index 6665a75..b1a660e 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -1,312 +1,403 @@
-// This file is a part of RCKangaroo software
-// (c) 2024, RetiredCoder (RC)
-// License: GPLv3, see "LICENSE.TXT" file
-// https://github.com/RetiredC
-
-
-#include "utils.h"
-#include <wchar.h>
-
-#ifdef _WIN32
-
-#else
-
-void _BitScanReverse64(u32* index, u64 msk) 
-{
-    *index = 63 - __builtin_clzll(msk); 
-}
-
-void _BitScanForward64(u32* index, u64 msk) 
-{
-    *index = __builtin_ffsll(msk) - 1; 
-}
-
-u64 _umul128(u64 m1, u64 m2, u64* hi) 
-{ 
-    uint128_t ab = (uint128_t)m1 * m2; *hi = (u64)(ab >> 64); return (u64)ab; 
-}
-
-u64 __shiftright128 (u64 LowPart, u64 HighPart, u8 Shift)
-{
-   u64 ret;
-   __asm__ ("shrd {%[Shift],%[HighPart],%[LowPart]|%[LowPart], %[HighPart], %[Shift]}" 
-      : [ret] "=r" (ret)
-      : [LowPart] "0" (LowPart), [HighPart] "r" (HighPart), [Shift] "Jc" (Shift)
-      : "cc");
-   return ret;
-}
-
-u64 __shiftleft128 (u64 LowPart, u64 HighPart, u8 Shift)
-{
-   u64 ret;
-   __asm__ ("shld {%[Shift],%[LowPart],%[HighPart]|%[HighPart], %[LowPart], %[Shift]}" 
-      : [ret] "=r" (ret)
-      : [LowPart] "r" (LowPart), [HighPart] "0" (HighPart), [Shift] "Jc" (Shift)
-      : "cc");
-   return ret;
-}   
-
-u64 GetTickCount64()
-{
-	struct timespec ts;
-	clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
-	return (u64)(ts.tv_nsec / 1000000) + ((u64)ts.tv_sec * 1000ull);
-}
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#define DB_REC_LEN			32
-#define DB_FIND_LEN			9
-#define DB_MIN_GROW_CNT		2
-
-//we need advanced memory management to reduce memory fragmentation
-//everything will be stable up to about 8TB RAM
-
-#define MEM_PAGE_SIZE		(128 * 1024)
-#define RECS_IN_PAGE		(MEM_PAGE_SIZE / DB_REC_LEN)
-#define MAX_PAGES_CNT		(0xFFFFFFFF / RECS_IN_PAGE)
-
-MemPool::MemPool()
-{
-	pnt = 0;
-}
-
-MemPool::~MemPool()
-{
-	Clear();
-}
-
-void MemPool::Clear()
-{
-	int cnt = (int)pages.size();
-	for (int i = 0; i < cnt; i++)
-		free(pages[i]);
-	pages.clear();
-	pnt = 0;
-}
-
-void* MemPool::AllocRec(u32* cmp_ptr)
-{
-	void* mem;
-	if (pages.empty() || (pnt + DB_REC_LEN > MEM_PAGE_SIZE))
-	{
-		if (pages.size() >= MAX_PAGES_CNT)
-			return NULL; //overflow
-		pages.push_back(malloc(MEM_PAGE_SIZE));
-		pnt = 0;
-	}
-	u32 page_ind = (u32)pages.size() - 1;
-	mem = (u8*)pages[page_ind] + pnt;
-	*cmp_ptr = (page_ind * RECS_IN_PAGE) | (pnt / DB_REC_LEN);
-	pnt += DB_REC_LEN;
-	return mem;
-}
-
-void* MemPool::GetRecPtr(u32 cmp_ptr)
-{
-	u32 page_ind = cmp_ptr / RECS_IN_PAGE;
-	u32 rec_ind = cmp_ptr % RECS_IN_PAGE;
-	return (u8*)pages[page_ind] + DB_REC_LEN * rec_ind;
-}
-
-TFastBase::TFastBase()
-{
-	memset(lists, 0, sizeof(lists));
-	memset(Header, 0, sizeof(Header));
-}
-
-TFastBase::~TFastBase()
-{
-	Clear();
-}
-
-void TFastBase::Clear()
-{
-	for (int i = 0; i < 256; i++)
-	{
-		for (int j = 0; j < 256; j++)
-			for (int k = 0; k < 256; k++)
-			{
-				if (lists[i][j][k].data)
-					free(lists[i][j][k].data);
-				lists[i][j][k].data = NULL;
-				lists[i][j][k].capacity = 0;
-				lists[i][j][k].cnt = 0;
-			}
-		mps[i].Clear();
-	}
-}
-
-u64 TFastBase::GetBlockCnt()
-{
-	u64 blockCount = 0;
-	for (int i = 0; i < 256; i++)
-		for (int j = 0; j < 256; j++)
-			for (int k = 0; k < 256; k++)
-			blockCount += lists[i][j][k].cnt;
-	return blockCount;
-}
-
-// http://en.cppreference.com/w/cpp/algorithm/lower_bound
-int TFastBase::lower_bound(TListRec* list, int mps_ind, u8* data)
-{
-	int count = list->cnt;
-	int it, first, step;
-	first = 0;
-	while (count > 0)
-	{
-		it = first;
-		step = count / 2;   
-		it += step;
-		void* ptr = mps[mps_ind].GetRecPtr(list->data[it]);
-		if (memcmp(ptr, data, DB_FIND_LEN) < 0)
-		{
-			first = ++it;
-			count -= step + 1;
-		}
-		else
-			count = step;
-	}
-	return first;
-}
- 
-u8* TFastBase::AddDataBlock(u8* data, int pos)
-{
-	TListRec* list = &lists[data[0]][data[1]][data[2]];
-	if (list->cnt >= list->capacity)
-	{
-		u32 grow = list->capacity / 2;
-		if (grow < DB_MIN_GROW_CNT)
-			grow = DB_MIN_GROW_CNT;
-		u32 newcap = list->capacity + grow;
-		if (newcap > 0xFFFF)
-			newcap = 0xFFFF;
-		if (newcap <= list->capacity)
-			return NULL; //failed
-		list->data = (u32*)realloc(list->data, newcap * sizeof(u32));
-		list->capacity = newcap;
-	}
-	int first = (pos < 0) ? lower_bound(list, data[0], data + 3) : pos;
-	memmove(list->data + first + 1, list->data + first, (list->cnt - first) * sizeof(u32));
-	u32 cmp_ptr;
-	void* ptr = mps[data[0]].AllocRec(&cmp_ptr);
-	list->data[first] = cmp_ptr;
-	memcpy(ptr, data + 3, DB_REC_LEN);
-	list->cnt++;
-	return (u8*)ptr;
-}
-
-u8* TFastBase::FindDataBlock(u8* data)
-{
-	bool res = false;
-	TListRec* list = &lists[data[0]][data[1]][data[2]];
-	int first = lower_bound(list, data[0], data + 3);
-	if (first == list->cnt)
-		return NULL;
-	void* ptr = mps[data[0]].GetRecPtr(list->data[first]);
-	if (memcmp(ptr, data + 3, DB_FIND_LEN))
-		return NULL;
-	return (u8*)ptr;
-}
-
-u8* TFastBase::FindOrAddDataBlock(u8* data)
-{
-	void* ptr;
-	TListRec* list = &lists[data[0]][data[1]][data[2]];
-	int first = lower_bound(list, data[0], data + 3);
-	if (first == list->cnt)
-		goto label_not_found;
-	ptr = mps[data[0]].GetRecPtr(list->data[first]);
-	if (memcmp(ptr, data + 3, DB_FIND_LEN))
-		goto label_not_found;
-	return (u8*)ptr;
-label_not_found:
-	AddDataBlock(data, first);
-	return NULL;
-}
-
-//slow but I hope you are not going to create huge DB with this proof-of-concept software
-bool TFastBase::LoadFromFile(char* fn)
-{
-	Clear();
-	FILE* fp = fopen(fn, "rb");
-	if (!fp)
-		return false;
-	if (fread(Header, 1, sizeof(Header), fp) != sizeof(Header))
-	{
-		fclose(fp);
-		return false;
-	}
-	for (int i = 0; i < 256; i++)
-		for (int j = 0; j < 256; j++)
-			for (int k = 0; k < 256; k++)
-			{
-				TListRec* list = &lists[i][j][k];
-				fread(&list->cnt, 1, 2, fp);
-				if (list->cnt)
-				{
-					u32 grow = list->cnt / 2;
-					if (grow < DB_MIN_GROW_CNT)
-						grow = DB_MIN_GROW_CNT;
-					u32 newcap = list->cnt + grow;
-					if (newcap > 0xFFFF)
-						newcap = 0xFFFF;
-					list->data = (u32*)realloc(list->data, newcap * sizeof(u32));
-					list->capacity = newcap;
-
-					for (int m = 0; m < list->cnt; m++)
-					{
-						u32 cmp_ptr;
-						void* ptr = mps[i].AllocRec(&cmp_ptr);
-						list->data[m] = cmp_ptr;
-						if (fread(ptr, 1, DB_REC_LEN, fp) != DB_REC_LEN)
-						{
-							fclose(fp);
-							return false;
-						}
-					}
-				}
-			}
-	fclose(fp);
-	return true;
-}
-
-bool TFastBase::SaveToFile(char* fn)
-{
-	FILE* fp = fopen(fn, "wb");
-	if (!fp)
-		return false;
-	if (fwrite(Header, 1, sizeof(Header), fp) != sizeof(Header))
-	{
-		fclose(fp);
-		return false;
-	}
-	for (int i = 0; i < 256; i++)
-		for (int j = 0; j < 256; j++)
-			for (int k = 0; k < 256; k++)
-			{
-				TListRec* list = &lists[i][j][k];
-				fwrite(&list->cnt, 1, 2, fp);
-				for (int m = 0; m < list->cnt; m++)
-				{
-					void* ptr = mps[i].GetRecPtr(list->data[m]);
-					if (fwrite(ptr, 1, DB_REC_LEN, fp) != DB_REC_LEN)
-					{
-						fclose(fp);
-						return false;
-					}
-				}
-			}
-	fclose(fp);
-	return true;
-}
-
-bool IsFileExist(char* fn)
-{
-	FILE* fp = fopen(fn, "rb");
-	if (!fp)
-		return false;
-	fclose(fp);
-	return true;
+// This file is a part of RCKangaroo software
+// (c) 2024, RetiredCoder (RC)
+// License: GPLv3, see "LICENSE.TXT" file
+// https://github.com/RetiredC
+
+
+#include "utils.h"
+#include <wchar.h>
+static const char kTamesV15Tag[8] = {'T','M','B','M','1','5','\0','\0'};
+static const char kTamesV16Tag[8] = {'T','M','B','M','1','6','\0','\0'};
+
+
+#ifndef DB_FIND_LEN
+#define DB_FIND_LEN 5
+#endif
+#ifndef DB_REC_LEN
+#define DB_REC_LEN (DB_FIND_LEN + 22 + 1)
+#endif
+#ifdef _WIN32
+
+#else
+
+void _BitScanReverse64(u32* index, u64 msk) 
+{
+    *index = 63 - __builtin_clzll(msk); 
+}
+
+void _BitScanForward64(u32* index, u64 msk) 
+{
+    *index = __builtin_ffsll(msk) - 1; 
+}
+
+u64 _umul128(u64 m1, u64 m2, u64* hi) 
+{ 
+    uint128_t ab = (uint128_t)m1 * m2; *hi = (u64)(ab >> 64); return (u64)ab; 
+}
+
+u64 __shiftright128 (u64 LowPart, u64 HighPart, u8 Shift)
+{
+   u64 ret;
+   __asm__ ("shrd {%[Shift],%[HighPart],%[LowPart]|%[LowPart], %[HighPart], %[Shift]}" 
+      : [ret] "=r" (ret)
+      : [LowPart] "0" (LowPart), [HighPart] "r" (HighPart), [Shift] "Jc" (Shift)
+      : "cc");
+   return ret;
+}
+
+u64 __shiftleft128 (u64 LowPart, u64 HighPart, u8 Shift)
+{
+   u64 ret;
+   __asm__ ("shld {%[Shift],%[LowPart],%[HighPart]|%[HighPart], %[LowPart], %[Shift]}" 
+      : [ret] "=r" (ret)
+      : [LowPart] "r" (LowPart), [HighPart] "0" (HighPart), [Shift] "Jc" (Shift)
+      : "cc");
+   return ret;
+}   
+
+u64 GetTickCount64()
+{
+	struct timespec ts;
+	clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+	return (u64)(ts.tv_nsec / 1000000) + ((u64)ts.tv_sec * 1000ull);
+}
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define DB_REC_LEN			32
+#define DB_FIND_LEN			9
+#define DB_MIN_GROW_CNT		2
+
+//we need advanced memory management to reduce memory fragmentation
+//everything will be stable up to about 8TB RAM
+
+#define MEM_PAGE_SIZE		(128 * 1024)
+#define RECS_IN_PAGE		(MEM_PAGE_SIZE / DB_REC_LEN)
+#define MAX_PAGES_CNT		(0xFFFFFFFF / RECS_IN_PAGE)
+
+MemPool::MemPool()
+{
+	pnt = 0;
+}
+
+MemPool::~MemPool()
+{
+	Clear();
+}
+
+void MemPool::Clear()
+{
+	int cnt = (int)pages.size();
+	for (int i = 0; i < cnt; i++)
+		free(pages[i]);
+	pages.clear();
+	pnt = 0;
+}
+
+void* MemPool::AllocRec(u32* cmp_ptr)
+{
+	void* mem;
+	if (pages.empty() || (pnt + DB_REC_LEN > MEM_PAGE_SIZE))
+	{
+		if (pages.size() >= MAX_PAGES_CNT)
+			return NULL; //overflow
+		pages.push_back(malloc(MEM_PAGE_SIZE));
+		pnt = 0;
+	}
+	u32 page_ind = (u32)pages.size() - 1;
+	mem = (u8*)pages[page_ind] + pnt;
+	*cmp_ptr = (page_ind * RECS_IN_PAGE) | (pnt / DB_REC_LEN);
+	pnt += DB_REC_LEN;
+	return mem;
+}
+
+void* MemPool::GetRecPtr(u32 cmp_ptr)
+{
+	u32 page_ind = cmp_ptr / RECS_IN_PAGE;
+	u32 rec_ind = cmp_ptr % RECS_IN_PAGE;
+	return (u8*)pages[page_ind] + DB_REC_LEN * rec_ind;
+}
+
+TFastBase::TFastBase()
+{
+	memset(lists, 0, sizeof(lists));
+	memset(Header, 0, sizeof(Header));
+}
+
+TFastBase::~TFastBase()
+{
+	Clear();
+}
+
+void TFastBase::Clear()
+{
+	for (int i = 0; i < 256; i++)
+	{
+		for (int j = 0; j < 256; j++)
+			for (int k = 0; k < 256; k++)
+			{
+				if (lists[i][j][k].data)
+					free(lists[i][j][k].data);
+				lists[i][j][k].data = NULL;
+				lists[i][j][k].capacity = 0;
+				lists[i][j][k].cnt = 0;
+			}
+		mps[i].Clear();
+	}
+}
+
+u64 TFastBase::GetBlockCnt()
+{
+	u64 blockCount = 0;
+	for (int i = 0; i < 256; i++)
+		for (int j = 0; j < 256; j++)
+			for (int k = 0; k < 256; k++)
+			blockCount += lists[i][j][k].cnt;
+	return blockCount;
+}
+
+// http://en.cppreference.com/w/cpp/algorithm/lower_bound
+int TFastBase::lower_bound(TListRec* list, int mps_ind, u8* data)
+{
+	int count = list->cnt;
+	int it, first, step;
+	first = 0;
+	while (count > 0)
+	{
+		it = first;
+		step = count / 2;   
+		it += step;
+		void* ptr = mps[mps_ind].GetRecPtr(list->data[it]);
+		if (memcmp(ptr, data, DB_FIND_LEN) < 0)
+		{
+			first = ++it;
+			count -= step + 1;
+		}
+		else
+			count = step;
+	}
+	return first;
+}
+ 
+u8* TFastBase::AddDataBlock(u8* data, int pos)
+{
+	TListRec* list = &lists[data[0]][data[1]][data[2]];
+	if (list->cnt >= list->capacity)
+	{
+		u32 grow = list->capacity / 2;
+		if (grow < DB_MIN_GROW_CNT)
+			grow = DB_MIN_GROW_CNT;
+		u32 newcap = list->capacity + grow;
+		if (newcap > 0xFFFF)
+			newcap = 0xFFFF;
+		if (newcap <= list->capacity)
+			return NULL; //failed
+		list->data = (u32*)realloc(list->data, newcap * sizeof(u32));
+		list->capacity = newcap;
+	}
+	int first = (pos < 0) ? lower_bound(list, data[0], data + 3) : pos;
+	memmove(list->data + first + 1, list->data + first, (list->cnt - first) * sizeof(u32));
+	u32 cmp_ptr;
+	void* ptr = mps[data[0]].AllocRec(&cmp_ptr);
+	list->data[first] = cmp_ptr;
+	memcpy(ptr, data + 3, DB_REC_LEN);
+	list->cnt++;
+	return (u8*)ptr;
+}
+
+u8* TFastBase::FindDataBlock(u8* data)
+{
+	bool res = false;
+	TListRec* list = &lists[data[0]][data[1]][data[2]];
+	int first = lower_bound(list, data[0], data + 3);
+	if (first == list->cnt)
+		return NULL;
+	void* ptr = mps[data[0]].GetRecPtr(list->data[first]);
+	if (memcmp(ptr, data + 3, DB_FIND_LEN))
+		return NULL;
+	return (u8*)ptr;
+}
+
+u8* TFastBase::FindOrAddDataBlock(u8* data)
+{
+	void* ptr;
+	TListRec* list = &lists[data[0]][data[1]][data[2]];
+	int first = lower_bound(list, data[0], data + 3);
+	if (first == list->cnt)
+		goto label_not_found;
+	ptr = mps[data[0]].GetRecPtr(list->data[first]);
+	if (memcmp(ptr, data + 3, DB_FIND_LEN))
+		goto label_not_found;
+	return (u8*)ptr;
+label_not_found:
+	AddDataBlock(data, first);
+	return NULL;
+}
+
+//slow but I hope you are not going to create huge DB with this proof-of-concept software
+bool TFastBase::LoadFromFile(char* fn)
+{
+Clear();
+    FILE* fp = fopen(fn, "rb");
+    if (!fp) return false;
+
+    if (fread(Header, 1, sizeof(Header), fp) != sizeof(Header)) {
+        fclose(fp); return false;
+    }
+
+    long pos_after_header = ftell(fp);
+    char tag[8];
+    size_t got = fread(tag, 1, sizeof(tag), fp);
+    bool use_v15 = (got == sizeof(tag) && memcmp(tag, kTamesV15Tag, 8) == 0);
+    bool use_v16 = (got == sizeof(tag) && memcmp(tag, kTamesV16Tag, 8) == 0);
+
+    if (!use_v15 && !use_v16) {
+        // --- Formato legado ---
+        fseek(fp, pos_after_header, SEEK_SET);
+
+        for (int i = 0; i < 256; i++)
+            for (int j = 0; j < 256; j++)
+                for (int k = 0; k < 256; k++)
+                {
+                    TListRec* list = &lists[i][j][k];
+
+                    if (fread(&list->cnt, 2, 1, fp) != 1) { fclose(fp); return false; }
+
+                    if (list->cnt) {
+                        // asegurar capacidad para list->data (u32*), no vector
+                        if (list->capacity < list->cnt) {
+                            unsigned short newcap = list->cnt;
+                            if (newcap < 16) newcap = 16;
+                            if (newcap > 0xFFFF) newcap = 0xFFFF;
+                            list->data = (u32*)realloc(list->data, (size_t)newcap * sizeof(u32));
+                            if (!list->data) { fclose(fp); return false; }
+                            list->capacity = newcap;
+                        }
+
+                        for (int m = 0; m < list->cnt; m++) {
+                            u32 cmp_ptr;
+                            void* ptr = mps[i].AllocRec(&cmp_ptr);
+                            list->data[m] = cmp_ptr;
+
+                            if (fread(ptr, 1, DB_REC_LEN, fp) != DB_REC_LEN) {
+                                fclose(fp); return false;
+                            }
+                        }
+                    }
+                }
+
+        fclose(fp);
+        return true;
+    }
+
+    // --- V1.5: bitmap + bulk por (i,j) ---
+    // selecc. tamaño por versión
+    size_t rec_len = use_v16 ? (size_t)DB_REC_LEN : (size_t)32;
+    for (int i = 0; i < 256; i++) {
+        for (int j = 0; j < 256; j++) {
+
+            unsigned char bitmap[32];
+            if (fread(bitmap, 1, 32, fp) != 32) { fclose(fp); return false; }
+
+            for (int k = 0; k < 256; k++) {
+                TListRec* list = &lists[i][j][k];
+
+                if (bitmap[k >> 3] & (1u << (k & 7))) {
+                    unsigned short cnt16;
+                    if (fread(&cnt16, 2, 1, fp) != 1) { fclose(fp); return false; }
+
+                    // asegurar capacidad (u32*)
+                    if (list->capacity < cnt16) {
+                        unsigned short newcap = cnt16;
+                        if (newcap < 16) newcap = 16;
+                        if (newcap > 0xFFFF) newcap = 0xFFFF;
+                        list->data = (u32*)realloc(list->data, (size_t)newcap * sizeof(u32));
+                        if (!list->data) { fclose(fp); return false; }
+                        list->capacity = newcap;
+                    }
+                    list->cnt = cnt16;
+
+                    size_t bytes = (size_t)cnt16 * rec_len;
+                    if (bytes) {
+                        std::vector<unsigned char> buf; buf.resize(bytes);
+                        if (fread(buf.data(), 1, bytes, fp) != bytes) { fclose(fp); return false; }
+
+                        for (int m = 0; m < cnt16; m++) {
+                            u32 cmp_ptr;
+                            void* dst = mps[i].AllocRec(&cmp_ptr);
+                            list->data[m] = cmp_ptr;
+                            memcpy(dst, &buf[(size_t)m * rec_len], rec_len);
+                        }
+                    }
+                } else {
+                    list->cnt = 0;
+                    // conservamos capacidad existente, no tocamos list->data
+                }
+            }
+        }
+    }
+
+    fclose(fp);
+    return true;
+}
+
+
+bool TFastBase::SaveToFile(char* fn)
+{
+FILE* fp = fopen(fn, "wb");
+    if (!fp) return false;
+
+    if (fwrite(Header, 1, sizeof(Header), fp) != sizeof(Header)) {
+        fclose(fp); return false;
+    }
+
+    // --- TAG V1.5 ---
+    if (fwrite(kTamesV16Tag, 1, sizeof(kTamesV16Tag), fp) != sizeof(kTamesV16Tag)) {
+        fclose(fp); return false;
+    }
+
+    size_t rec_len = (size_t)DB_REC_LEN;
+    // (i,j): bitmap de 256 bits + por cada k set: cnt (u16) y bloque contiguo de registros
+    for (int i = 0; i < 256; i++) {
+        for (int j = 0; j < 256; j++) {
+
+            unsigned char bitmap[32]; memset(bitmap, 0, sizeof(bitmap));
+            for (int k = 0; k < 256; k++) {
+                TListRec* list = &lists[i][j][k];
+                if (list->cnt) bitmap[k >> 3] |= (1u << (k & 7));
+            }
+            if (fwrite(bitmap, 1, 32, fp) != 32) { fclose(fp); return false; }
+
+            for (int k = 0; k < 256; k++) if (bitmap[k >> 3] & (1u << (k & 7))) {
+                TListRec* list = &lists[i][j][k];
+                unsigned short cnt16 = (unsigned short)list->cnt;
+                if (fwrite(&cnt16, 2, 1, fp) != 1) { fclose(fp); return false; }
+
+                size_t bytes = (size_t)cnt16 * rec_len;
+                if (bytes) {
+                    // Junta en buffer para un solo fwrite
+                    std::vector<unsigned char> buf; buf.resize(bytes);
+                    for (int m = 0; m < cnt16; m++) {
+                        void* ptr = mps[i].GetRecPtr(list->data[m]);
+                        memcpy(&buf[(size_t)m * DB_REC_LEN], ptr, DB_REC_LEN);
+                    }
+                    if (fwrite(buf.data(), 1, bytes, fp) != bytes) { fclose(fp); return false; }
+                }
+            }
+        }
+    }
+
+    fclose(fp);
+    return true;
+}
+
+
+bool IsFileExist(char* fn)
+{
+	FILE* fp = fopen(fn, "rb");
+	if (!fp)
+		return false;
+	fclose(fp);
+	return true;
 }
\ No newline at end of file