diff --git a/src/Nethermind/Nethermind.Benchmark/State/ReadOnlySnapshotBundleBenchmark.cs b/src/Nethermind/Nethermind.Benchmark/State/ReadOnlySnapshotBundleBenchmark.cs index 98f615509c19..a4582c530b73 100644 --- a/src/Nethermind/Nethermind.Benchmark/State/ReadOnlySnapshotBundleBenchmark.cs +++ b/src/Nethermind/Nethermind.Benchmark/State/ReadOnlySnapshotBundleBenchmark.cs @@ -14,6 +14,7 @@ using Nethermind.Logging; using Nethermind.State.Flat; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.State.Flat.ScopeProvider; using Nethermind.Trie; using FlatSnapshot = Nethermind.State.Flat.Snapshot; @@ -73,7 +74,6 @@ public void Setup() int storageAccountCount = 20 * multiplier; int slotsPerStorageAccount = 100 * multiplier; - // Build ReadOnlySnapshotBundle from previously captured snapshots SnapshotPooledList prevSnapshots = new(allSnapshots.Count); foreach (FlatSnapshot s in allSnapshots) { @@ -81,8 +81,10 @@ public void Setup() prevSnapshots.Add(s); } + // Build ReadOnlySnapshotBundle from previously captured snapshots ReadOnlySnapshotBundle readOnly = new( - prevSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false); + prevSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false, + PersistedSnapshotStack.Empty()); NullTrieNodeCache cache = new(); SnapshotBundle bundle = new( readOnly, cache, resourcePool, ResourcePool.Usage.MainBlockProcessing); @@ -154,7 +156,6 @@ public void Setup() maxSlotsPerStorageAccount = slotsPerStorageAccount; } - // Build final ReadOnlySnapshotBundle with all 8 snapshots SnapshotPooledList finalSnapshots = new(allSnapshots.Count); foreach (FlatSnapshot s in allSnapshots) { @@ -162,8 +163,10 @@ public void Setup() finalSnapshots.Add(s); } + // Build final ReadOnlySnapshotBundle with all 8 snapshots _bundle = new ReadOnlySnapshotBundle( - finalSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false); + finalSnapshots, new NoopPersistenceReader(), 
recordDetailedMetrics: false, + PersistedSnapshotStack.Empty()); // --- Hit arrays --- _hitAccounts = new Address[ArraySize]; diff --git a/src/Nethermind/Nethermind.Benchmark/State/WriteBatchBenchmark.cs b/src/Nethermind/Nethermind.Benchmark/State/WriteBatchBenchmark.cs index 8abbc86b8200..147723cc7bed 100644 --- a/src/Nethermind/Nethermind.Benchmark/State/WriteBatchBenchmark.cs +++ b/src/Nethermind/Nethermind.Benchmark/State/WriteBatchBenchmark.cs @@ -13,6 +13,7 @@ using Nethermind.Logging; using Nethermind.State.Flat; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.State.Flat.ScopeProvider; using Nethermind.Trie; using FlatSnapshot = Nethermind.State.Flat.Snapshot; @@ -65,7 +66,8 @@ public void GlobalSetup() } ReadOnlySnapshotBundle readOnly = new( - prevSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false); + prevSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false, + PersistedSnapshotStack.Empty()); NullTrieNodeCache cache = new(); SnapshotBundle bundle = new( readOnly, cache, _resourcePool, ResourcePool.Usage.MainBlockProcessing); @@ -147,7 +149,8 @@ public void IterationSetup() } ReadOnlySnapshotBundle readOnly = new( - prevSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false); + prevSnapshots, new NoopPersistenceReader(), recordDetailedMetrics: false, + PersistedSnapshotStack.Empty()); NullTrieNodeCache cache = new(); SnapshotBundle bundle = new( readOnly, cache, _resourcePool, ResourcePool.Usage.MainBlockProcessing); diff --git a/src/Nethermind/Nethermind.Core/Utils/SmallRefCountingDisposable.cs b/src/Nethermind/Nethermind.Core/Utils/SmallRefCountingDisposable.cs new file mode 100644 index 000000000000..2096ee03760a --- /dev/null +++ b/src/Nethermind/Nethermind.Core/Utils/SmallRefCountingDisposable.cs @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using 
System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Threading; + +namespace Nethermind.Core.Utils; + +/// +/// Variant of that stores its lease counter inline as a single +/// instead of a cache-line-padded one, trading false-sharing protection for a much +/// smaller per-instance footprint. Prefer it for types that exist in large numbers and whose lease +/// counts are rarely contended across cores. +/// +public abstract class SmallRefCountingDisposable(int initialCount = 1) : IDisposable +{ + private const int Single = 1; + private const int NoAccessors = 0; + private const int Disposing = -1; + + private long _leases = initialCount; + + public void AcquireLease() + { + if (!TryAcquireLease()) + { + ThrowCouldNotAcquire(); + } + + [DoesNotReturn] + [StackTraceHidden] + static void ThrowCouldNotAcquire() => throw new InvalidOperationException("The lease cannot be acquired"); + } + + protected bool TryAcquireLease() + { + // Volatile read for starting value + long current = Volatile.Read(ref _leases); + + while (true) + { + // Reject once the count has reached zero (NoAccessors) or gone to Disposing: the object is + // being torn down. Acquiring at NoAccessors would resurrect an object whose owner has + // already observed the zero count and begun teardown — the release path moves the count + // 1 → 0 and only then CASes 0 → Disposing, so a concurrent acquirer can briefly see 0. + // Checking inside the loop (not just on the initial read) also closes the window where a + // failed CAS hands back a now-zero count. + if (current <= NoAccessors) + { + return false; + } + + long prev = Interlocked.CompareExchange(ref _leases, current + Single, current); + if (prev == current) + { + // Successfully acquired + return true; + } + + // Try again with the observed value + current = prev; + // Add PAUSE instruction to reduce shared core contention + Thread.SpinWait(1); + } + } + + /// + /// Disposes it once, decreasing the lease count by 1. 
+    ///
+    public void Dispose() => ReleaseLeaseOnce();
+
+    /// <summary>
+    /// Releases a single lease by decrementing the counter with a compare-and-swap loop.
+    /// The caller whose decrement moves the count from 1 to 0 then attempts the
+    /// 0 -> Disposing transition and, if it wins that CAS, invokes <see cref="CleanUp"/>.
+    /// </summary>
+    /// <exception cref="ObjectDisposedException">
+    /// The observed lease count is already zero or negative (mismatched acquire/release).
+    /// The counter is left untouched in that case.
+    /// </exception>
+    private void ReleaseLeaseOnce()
+    {
+        // Volatile read for starting value
+        long current = Volatile.Read(ref _leases);
+
+        while (true)
+        {
+            // Validate on every iteration, not only on the initial read: a failed CAS can hand
+            // back a count that is already NoAccessors or Disposing. Decrementing from there
+            // would move the counter away from the value the final 0 -> Disposing CAS expects,
+            // so the legitimate last releaser would skip CleanUp() and the resource would leak.
+            // This mirrors the in-loop check performed by TryAcquireLease.
+            if (current <= NoAccessors)
+            {
+                // Mismatched Acquire/Release
+                ThrowOverDisposed();
+            }
+
+            long prev = Interlocked.CompareExchange(ref _leases, current - Single, current);
+            if (prev != current)
+            {
+                // Lost the race; try again with the observed value
+                current = prev;
+                // Add PAUSE instruction to reduce shared core contention
+                Thread.SpinWait(1);
+                continue;
+            }
+            if (prev == Single)
+            {
+                // Last use, try to dispose underlying
+                break;
+            }
+
+            // Successfully released
+            return;
+        }
+
+        // Only the thread that wins the 0 -> Disposing transition runs the cleanup.
+        if (Interlocked.CompareExchange(ref _leases, Disposing, NoAccessors) == NoAccessors)
+        {
+            // set to disposed by this Release
+            CleanUp();
+        }
+
+        [DoesNotReturn]
+        [StackTraceHidden]
+        static void ThrowOverDisposed() => throw new ObjectDisposedException("The lease has already been disposed");
+    }
+
+    protected abstract void CleanUp();
+
+    public override string ToString()
+    {
+        long leases = Volatile.Read(ref _leases);
+        return leases == Disposing ? 
"Disposed" : $"Leases: {leases}"; + } +} diff --git a/src/Nethermind/Nethermind.Db/DbNames.cs b/src/Nethermind/Nethermind.Db/DbNames.cs index ef8355873016..c576515e0a57 100644 --- a/src/Nethermind/Nethermind.Db/DbNames.cs +++ b/src/Nethermind/Nethermind.Db/DbNames.cs @@ -23,5 +23,6 @@ public static class DbNames public const string PeersDb = "peers"; public const string LogIndex = "logIndex"; public const string Preimage = "preimage"; + public const string PersistedSnapshotCatalog = "persistedSnapshotCatalog"; } } diff --git a/src/Nethermind/Nethermind.Db/FlatDbConfig.cs b/src/Nethermind/Nethermind.Db/FlatDbConfig.cs index 3442d43504b2..49e440ce1111 100644 --- a/src/Nethermind/Nethermind.Db/FlatDbConfig.cs +++ b/src/Nethermind/Nethermind.Db/FlatDbConfig.cs @@ -24,4 +24,13 @@ public class FlatDbConfig : IFlatDbConfig public long BlockCacheSizeBudget { get; set; } = 1.GiB; public long CompactionOffset { get; set; } = -1; public long TrieCacheMemoryBudget { get; set; } = 512.MiB; + public bool EnableLongFinality { get; set; } = false; + public int LongFinalityMaxReorgDepth { get; set; } = 90000; + public int MaxInMemoryBaseSnapshotCount { get; set; } = 128; + public long ArenaFileSizeBytes { get; set; } = 1.GiB; + public long PersistedSnapshotDedicatedArenaThresholdBytes { get; set; } = 1.GiB; + public bool PersistedSnapshotPunchHoleOnReclaim { get; set; } = true; + public int PersistedSnapshotMaxCompactSize { get; set; } = 1024 * 1024; + public bool ValidatePersistedSnapshot { get; set; } = false; + public double PersistedSnapshotBloomBitsPerKey { get; set; } = 14.0; } diff --git a/src/Nethermind/Nethermind.Db/IFlatDbConfig.cs b/src/Nethermind/Nethermind.Db/IFlatDbConfig.cs index 021022328484..895f03dca283 100644 --- a/src/Nethermind/Nethermind.Db/IFlatDbConfig.cs +++ b/src/Nethermind/Nethermind.Db/IFlatDbConfig.cs @@ -34,7 +34,7 @@ public interface IFlatDbConfig : IConfig [ConfigItem(Description = "Max in flight compact job", DefaultValue = "32")] int 
MaxInFlightCompactJob { get; set; } - [ConfigItem(Description = "Max reorg depth", DefaultValue = "256")] + [ConfigItem(Description = "Max reorg depth — the force-persist backstop used when EnableLongFinality is off: once the in-memory depth exceeds it while finality is stalled, persistence is forced to bound memory.", DefaultValue = "256")] int MaxReorgDepth { get; set; } [ConfigItem(Description = "Minimum reorg depth", DefaultValue = "128")] @@ -55,6 +55,33 @@ public interface IFlatDbConfig : IConfig [ConfigItem(Description = "Verify with trie", DefaultValue = "false")] bool VerifyWithTrie { get; set; } + [ConfigItem(Description = "Enable long finality support with persisted snapshots", DefaultValue = "false")] + bool EnableLongFinality { get; set; } + + [ConfigItem(Description = "Force-persist backstop used when EnableLongFinality is on, in place of MaxReorgDepth. The persisted-snapshot tier serves deep reorgs, so this is much larger than the non-long-finality backstop.", DefaultValue = "90000")] + int LongFinalityMaxReorgDepth { get; set; } + + [ConfigItem(Description = "Maximum number of in-memory base snapshots before conversion to the persisted-snapshot tier kicks in. 
Counted as `SnapshotCount` of the in-memory repository, not a block-distance depth.", DefaultValue = "128")] + int MaxInMemoryBaseSnapshotCount { get; set; } + + [ConfigItem(Description = "Maximum size in bytes for a single arena file before a new one is started.", DefaultValue = "1073741824")] + long ArenaFileSizeBytes { get; set; } + + [ConfigItem(Description = "Estimated-size threshold (bytes) at or above which a persisted-snapshot arena write goes to its own dedicated file instead of being packed into a shared arena.", DefaultValue = "1073741824")] + long PersistedSnapshotDedicatedArenaThresholdBytes { get; set; } + + [ConfigItem(Description = "When reclaiming dead persisted-snapshot arena ranges — metadata reservation cleanup and blob-file frontier reset — call fallocate(FALLOC_FL_PUNCH_HOLE) to free the underlying disk blocks. Linux-only; automatically and permanently disabled per arena pool if the filesystem reports the operation unsupported. Set false to skip hole-punching entirely (the page-cache posix_fadvise still runs).", DefaultValue = "true")] + bool PersistedSnapshotPunchHoleOnReclaim { get; set; } + + [ConfigItem(Description = "Max persisted snapshot compaction size (hierarchical compaction ceiling for persisted layer), in blocks", DefaultValue = "1048576")] + int PersistedSnapshotMaxCompactSize { get; set; } + + [ConfigItem(Description = "Validate persisted snapshots against in-memory snapshots after conversion (debug/diagnostic only)", DefaultValue = "false")] + bool ValidatePersistedSnapshot { get; set; } + + [ConfigItem(Description = "Bits per key for the per-snapshot in-memory bloom filter. One unified filter covers address/slot/self-destruct keys plus state-trie and storage-trie node paths. Higher = lower false-positive rate but more RAM. 
0 disables the filter (lookups behave as full sweeps).", DefaultValue = "14.0")] + double PersistedSnapshotBloomBitsPerKey { get; set; } + [ConfigItem(Description = "Persistent dedicated reader threads used to resolve hinted BAL read sets into the pre-block cache. -1 for 4x logical processor count capped at 64. Values below 1 are clamped to 1. Use --Blocks.ParallelExecutionBatchRead=false to disable BAL warming entirely.", DefaultValue = "-1")] int WarmReadConcurrency { get; set; } } diff --git a/src/Nethermind/Nethermind.Init/Modules/FlatWorldStateModule.cs b/src/Nethermind/Nethermind.Init/Modules/FlatWorldStateModule.cs index d00faa04bd6a..20e7baa805c5 100644 --- a/src/Nethermind/Nethermind.Init/Modules/FlatWorldStateModule.cs +++ b/src/Nethermind/Nethermind.Init/Modules/FlatWorldStateModule.cs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-only using System; +using System.IO; using Autofac; using Nethermind.Api.Steps; using Nethermind.Blockchain; @@ -17,9 +18,12 @@ using Nethermind.JsonRpc.Modules.Admin; using Nethermind.Logging; using Nethermind.Monitoring.Config; +using Nethermind.Api; using Nethermind.State.Flat; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.State.Flat.ScopeProvider; +using Nethermind.State.Flat.PersistedSnapshots.Storage; using Nethermind.State.Flat.Sync; using Nethermind.State.Flat.Sync.Snap; @@ -46,6 +50,7 @@ protected override void Load(ContainerBuilder builder) ctx.Resolve(), ctx.Resolve(), ctx.Resolve(), + ctx.Resolve(), ctx.Resolve(), ctx.Resolve(), ctx.Resolve(), @@ -55,7 +60,26 @@ protected override void Load(ContainerBuilder builder) .AddSingleton() .AddSingleton() .AddSingleton() + // Shared ArenaManager + BlobArenaManager singletons: the persisted-snapshot repo and + // the compactor MUST resolve the same instances, otherwise compaction would write + // through a different mmap than the repository reads from. 
+ .AddSingleton((cfg, initConfig, logManager) => + { + string basePath = Path.Combine(initConfig.BaseDbPath, "persisted_snapshot"); + return new ArenaManager(Path.Combine(basePath, "arena"), cfg, logManager); + }) + .AddSingleton(ctx => ctx.Resolve()) + .AddSingleton((cfg, initConfig) => + { + string basePath = Path.Combine(initConfig.BaseDbPath, "persisted_snapshot"); + return new BlobArenaManager( + Path.Combine(basePath, "blob"), + cfg.ArenaFileSizeBytes); + }) + .AddSingleton() .AddSingleton() + // Registered after ISnapshotRepository so DI disposes it first. + .AddSingleton() .AddSingleton(flatDbConfig.TrieWarmerWorkerCount == 0 ? _ => new NoopTrieWarmer() : ctx => ctx.Resolve()) @@ -72,6 +96,17 @@ protected override void Load(ContainerBuilder builder) // Persistences .AddColumnDatabase(DbNames.Flat) + // Persisted snapshot catalog: dedicated RocksDB co-located with the arena/blob files it + // indexes under /persisted_snapshot/catalog/. Wiping persisted_snapshot/ + // therefore wipes the catalog alongside the data. + .AddKeyedSingleton(DbNames.PersistedSnapshotCatalog, ctx => ctx + .Resolve() + .CreateDb(new DbSettings( + nameof(DbNames.PersistedSnapshotCatalog), + Path.Combine("persisted_snapshot", "catalog")))) + .AddSingleton(ctx => + new SnapshotCatalog(ctx.ResolveKeyed(DbNames.PersistedSnapshotCatalog))) + .AddSingleton(ctx => ctx.Resolve()) .AddSingleton() .AddSingleton() .AddDecorator() @@ -99,6 +134,20 @@ protected override void Load(ContainerBuilder builder) }) ; + // EnableLongFinality off: inert the whole persisted tier. The Null loader skips loading any + // on-disk tier at startup and never converts in-memory snapshots into it; the Null catalog keeps + // it empty (nothing recorded or loaded); the Null compactor runs no background compaction. The + // conversion paths in PersistenceManager.DetermineSnapshotAction are also gated on this flag. 
+ // SnapshotRepository still constructs its arena/blob/catalog stores under + // `/persisted_snapshot/`, but they stay empty and unread. + if (!flatDbConfig.EnableLongFinality) + { + builder + .AddSingleton(NullSnapshotCatalog.Instance) + .AddSingleton(NullPersistedSnapshotLoader.Instance) + .AddSingleton(NullPersistedSnapshotCompactor.Instance); + } + if (flatDbConfig.ImportFromPruningTrieState) { builder diff --git a/src/Nethermind/Nethermind.State.Flat.Test/ArenaMetricsTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/ArenaMetricsTests.cs new file mode 100644 index 000000000000..bbd6a86fac41 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/ArenaMetricsTests.cs @@ -0,0 +1,123 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using Nethermind.Db; +using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +/// +/// Arena / blob allocated-bytes gauges. Verifies that the metric reflects +/// Frontier (bytes actually written), not the pre-extended sparse mmap size, and +/// that arena vs blob files surface in distinct gauges. +/// +[TestFixture] +public class ArenaMetricsTests +{ + private string _testDir = null!; + + [SetUp] + public void SetUp() + { + _testDir = Path.Combine(Path.GetTempPath(), $"nm_arena_metrics_{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + } + + [TearDown] + public void TearDown() + { + try { Directory.Delete(_testDir, recursive: true); } catch { /* best-effort */ } + } + + [Test] + public void ArenaWriter_Complete_AdvancesAllocatedBytes_ByFrontierDelta_NotMappedSize() + { + // Use a delta from the baseline so parallel-running tests don't interfere. 
+ const long maxArenaSize = 64 * 1024; // 64 KiB sparse arena file + const int payloadBytes = 4096; + + long arenaBytesBefore = Metrics.ArenaAllocatedBytes; + long arenaCountBefore = Metrics.ArenaFileCount; + long blobBytesBefore = Metrics.BlobAllocatedBytes; + long blobCountBefore = Metrics.BlobFileCount; + long resvBytesBefore = Metrics.ArenaReservationBytes; + + string arenaDir = Path.Combine(_testDir, "arena"); + using ArenaManager arena = new(arenaDir, new FlatDbConfig + { + ArenaFileSizeBytes = maxArenaSize, + }, LimboLogs.Instance); + + // Before any write the file isn't materialised yet (CreateArenaFile fires on first writer). + Assert.That(Metrics.ArenaAllocatedBytes, Is.EqualTo(arenaBytesBefore)); + Assert.That(Metrics.ArenaFileCount, Is.EqualTo(arenaCountBefore)); + + ArenaReservation reservation; + using (ArenaWriter writer = arena.CreateWriter(payloadBytes)) + { + // File materialised — count +1, allocated bytes still 0 (frontier == 0 at open). + Assert.That(Metrics.ArenaFileCount, Is.EqualTo(arenaCountBefore + 1)); + Assert.That(Metrics.ArenaAllocatedBytes, Is.EqualTo(arenaBytesBefore)); + + ref ArenaBufferWriter buf = ref writer.GetWriter(); + buf.GetSpan(payloadBytes).Clear(); + buf.Advance(payloadBytes); + (_, reservation) = writer.Complete(); + } + + // After Complete the frontier delta lands in ArenaAllocatedBytes — exactly the + // payload size, NOT the 64 KiB sparse MaxSize. + Assert.That((Metrics.ArenaAllocatedBytes - arenaBytesBefore), Is.EqualTo(payloadBytes)); + + Assert.That((Metrics.ArenaReservationBytes - resvBytesBefore), Is.EqualTo(payloadBytes)); + + // Arena and blob gauges are independent — no blob activity here. + Assert.That(Metrics.BlobAllocatedBytes, Is.EqualTo(blobBytesBefore)); + Assert.That(Metrics.BlobFileCount, Is.EqualTo(blobCountBefore)); + + // Dropping the reservation marks all its bytes dead → MarkDead drops the file → + // OnArenaRemoved returns the count and allocated-bytes contributions to baseline. 
+ reservation.Dispose(); + Assert.That(Metrics.ArenaReservationBytes, Is.EqualTo(resvBytesBefore)); + Assert.That(Metrics.ArenaFileCount, Is.EqualTo(arenaCountBefore)); + Assert.That(Metrics.ArenaAllocatedBytes, Is.EqualTo(arenaBytesBefore)); + } + + [Test] + public void BlobArenaWriter_Complete_AdvancesBlobAllocatedBytes_AndKeepsArenaGaugeAtZero() + { + const long maxFileSize = 64 * 1024; + const int blobBytes = 1024; + + long arenaBytesBefore = Metrics.ArenaAllocatedBytes; + long arenaCountBefore = Metrics.ArenaFileCount; + long blobBytesBefore = Metrics.BlobAllocatedBytes; + long blobCountBefore = Metrics.BlobFileCount; + + string blobDir = Path.Combine(_testDir, "blob"); + using BlobArenaManager blobs = new(blobDir, maxFileSize); + + using (BlobArenaWriter writer = blobs.CreateWriter(blobBytes)) + { + // File materialised on first writer — count +1, allocated still 0. + Assert.That(Metrics.BlobFileCount, Is.EqualTo(blobCountBefore + 1)); + Assert.That(Metrics.BlobAllocatedBytes, Is.EqualTo(blobBytesBefore)); + + byte[] rlp = new byte[blobBytes]; + writer.WriteRlp(rlp); + writer.Complete(); + } + + // After Complete: blob allocated bytes advance by exactly the written size (not the + // 64 KiB MaxSize of the sparse file). + Assert.That((Metrics.BlobAllocatedBytes - blobBytesBefore), Is.EqualTo(blobBytes)); + + // Arena gauges stay flat — blob writes never touch them. 
+ Assert.That(Metrics.ArenaAllocatedBytes, Is.EqualTo(arenaBytesBefore)); + Assert.That(Metrics.ArenaFileCount, Is.EqualTo(arenaCountBefore)); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/ArenaReclaimPunchHoleTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/ArenaReclaimPunchHoleTests.cs new file mode 100644 index 000000000000..bb77c28dc9a5 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/ArenaReclaimPunchHoleTests.cs @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.Diagnostics; +using System.IO; +using System.Linq; +using Nethermind.Db; +using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +/// +/// Verifies that dead persisted-snapshot arena ranges have their disk blocks reclaimed via +/// fallocate(FALLOC_FL_PUNCH_HOLE) — on metadata-reservation cleanup and on blob-file +/// frontier reset — and that the PersistedSnapshotPunchHoleOnReclaim flag gates it. +/// Linux-only; gracefully ignored when the temp filesystem does not support hole-punching. 
+/// +[TestFixture] +public class ArenaReclaimPunchHoleTests +{ + private string _testDir = null!; + + [SetUp] + public void SetUp() + { + _testDir = Path.Combine(Path.GetTempPath(), $"nm_punchhole_{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + } + + [TearDown] + public void TearDown() + { + try { Directory.Delete(_testDir, recursive: true); } catch { /* best-effort */ } + } + + [TestCase(true)] + [TestCase(false)] + public void ReservationCleanup_PunchesHole_ForDeadRange_WhenEnabled(bool punchHoleOnReclaim) + { + if (!OperatingSystem.IsLinux()) Assert.Ignore("fallocate punch-hole is Linux-only"); + int pageSize = Environment.SystemPageSize; + string arenaDir = Path.Combine(_testDir, "arena"); + + using ArenaManager manager = new(arenaDir, new FlatDbConfig + { + ArenaFileSizeBytes = 8L * 1024 * 1024, + PersistedSnapshotPunchHoleOnReclaim = punchHoleOnReclaim, + }, LimboLogs.Instance); + + // Two reservations in one shared arena file: disposing the first leaves the file + // alive (the second keeps DeadBytes < Frontier), so cleanup actually punches. 
+ (SnapshotLocation locA, ArenaReservation reservationA) = WriteReservation(manager, 64 * pageSize); + (SnapshotLocation locB, ArenaReservation reservationB) = WriteReservation(manager, pageSize); + Assert.That(locA.ArenaId, Is.EqualTo(locB.ArenaId), "both writes must pack into the same shared arena file"); + + string arenaPath = Directory.GetFiles(arenaDir).Single(); + Fsync(arenaPath); + long blocksBefore = StatBlocks(arenaPath); + Assert.That(blocksBefore, Is.GreaterThan(0), "the written reservations should occupy real disk blocks"); + + reservationA.Dispose(); + + if (punchHoleOnReclaim && !manager.PunchHoleSupported) + Assert.Ignore("filesystem does not support fallocate punch-hole"); + + long blocksAfter = StatBlocks(arenaPath); + if (punchHoleOnReclaim) + Assert.That(blocksAfter, Is.LessThan(blocksBefore), "cleanup should punch-hole reservation A's dead range"); + else + Assert.That(blocksAfter, Is.EqualTo(blocksBefore), "punch-hole is disabled"); + + reservationB.Dispose(); + } + + [Test] + public void BlobFrontierReset_TruncatesFile_ForOrphanedRange() + { + const int rlpSize = 4096; + const int rlpCount = 64; + string blobDir = Path.Combine(_testDir, "blob"); + + using BlobArenaManager blobs = new(blobDir, 8L * 1024 * 1024); + + ushort blobId; + using (BlobArenaWriter writer = blobs.CreateWriter(rlpSize * rlpCount)) + { + byte[] rlp = new byte[rlpSize]; + for (int i = 0; i < rlpCount; i++) + { + Random.Shared.NextBytes(rlp); + writer.WriteRlp(rlp); + } + writer.Complete(); + blobId = writer.BlobArenaId; + } + + string blobPath = Directory.GetFiles(blobDir).Single(); + long lengthBefore = new FileInfo(blobPath).Length; + Assert.That(lengthBefore, Is.GreaterThan(0), "the writer's appends should have grown the file"); + + // The writer's lease is gone, so the file is orphaned — frontier reset recycles it + // by truncating the file back to length 0 (frees disk blocks + zeros logical length + // in one syscall, eliminating the sparse-tail mismatch the old 
punch-hole path left). + BlobArenaFile file = blobs.GetFile(blobId); + blobs.TryResetOrphanedFrontier(file); + + Assert.That(file.Frontier, Is.EqualTo(0), "in-memory frontier reset"); + Assert.That(new FileInfo(blobPath).Length, Is.EqualTo(0), "on-disk file truncated by frontier reset"); + } + + private static (SnapshotLocation, ArenaReservation) WriteReservation(ArenaManager manager, int size) + { + using ArenaWriter writer = manager.CreateWriter(size); + ref ArenaBufferWriter buf = ref writer.GetWriter(); + int remaining = size; + while (remaining > 0) + { + int chunk = Math.Min(remaining, 64 * 1024); + Random.Shared.NextBytes(buf.GetSpan(chunk)[..chunk]); + buf.Advance(chunk); + remaining -= chunk; + } + return writer.Complete(); + } + + // Force the OS page cache to disk so st_blocks reflects the written data before the + // punch — ext4 delayed allocation otherwise leaves freshly-written blocks uncounted. + private static void Fsync(string path) + { + using FileStream fs = new(path, FileMode.Open, FileAccess.ReadWrite, FileShare.ReadWrite); + fs.Flush(flushToDisk: true); + } + + // .NET exposes no st_blocks accessor; shell out to coreutils stat (512-byte block count). 
+ private static long StatBlocks(string path) + { + ProcessStartInfo psi = new() { FileName = "stat", RedirectStandardOutput = true, UseShellExecute = false }; + psi.ArgumentList.Add("-c"); + psi.ArgumentList.Add("%b"); + psi.ArgumentList.Add(path); + using Process proc = Process.Start(psi)!; + string output = proc.StandardOutput.ReadToEnd().Trim(); + proc.WaitForExit(); + return long.Parse(output); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/CompactionScheduleTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/CompactionScheduleTests.cs index 5865dde0b6cc..1e5d04c26cba 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/CompactionScheduleTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/CompactionScheduleTests.cs @@ -201,4 +201,55 @@ public void NextFullCompactionAfter_CompactSizeDisabled_ReturnsLongMaxValue() public void Constructor_NonPowerOf2CompactSize_Throws() => Assert.Throws(() => new CompactionSchedule(new MemDb(), new FlatDbConfig { CompactSize = 10 }, LimboLogs.Instance)); + + [TestCase(0, 0, 8192, false)] // block 0 → size 1 + [TestCase(0, 16, 8192, false)] // exactly CompactSize — not "large" + [TestCase(0, 8, 8192, false)] // intermediate (< CompactSize) + [TestCase(0, 32, 8192, true)] // 2× CompactSize + [TestCase(0, 64, 8192, true)] // 4× + [TestCase(3, 13, 8192, false)] // (13+3) = 16, exactly CompactSize + [TestCase(3, 16, 8192, false)] // (16+3) = 19, alignment 1 + [TestCase(3, 29, 8192, true)] // (29+3) = 32, > CompactSize + [TestCase(0, 32, 16, false)] // max == CompactSize: alignment 32 capped to 16 → not large + public void IsLargeCompactionBoundary_TrueWhenWindowExceedsCompactSize(int offset, long blockNumber, int maxCompactSize, bool expected) + { + FlatDbConfig config = new() { CompactSize = 16, PersistedSnapshotMaxCompactSize = maxCompactSize }; + CompactionSchedule schedule = ScheduleHelper.CreateWithOffset(config, offset); + + Assert.That(schedule.IsLargeCompactionBoundary(blockNumber), 
Is.EqualTo(expected)); + } + + [TestCase(0, 0, 8192, 1L)] // block 0 → 1 + [TestCase(0, 16, 8192, 16L)] // natural CompactSize boundary + [TestCase(0, 32, 8192, 32L)] // tier above CompactSize, below cap + [TestCase(0, 48, 8192, 16L)] // 48 & -48 = 16 + [TestCase(0, 64, 8192, 64L)] // 4×, below cap + [TestCase(3, 13, 8192, 16L)] // shifted: (13+3) & -(13+3) = 16 + [TestCase(3, 29, 8192, 32L)] // shifted: 32 (above CompactSize=16) + [TestCase(0, 64, 32, 32L)] // raw alignment 64 capped at PersistedSnapshotMaxCompactSize=32 + [TestCase(0, 128, 32, 32L)] // raw alignment 128 capped at 32 + public void GetPersistedSnapshotCompactSize_CappedAndOffsetAware(int offset, long blockNumber, int maxCompactSize, long expected) + { + FlatDbConfig config = new() { CompactSize = 16, PersistedSnapshotMaxCompactSize = maxCompactSize }; + CompactionSchedule schedule = ScheduleHelper.CreateWithOffset(config, offset); + + Assert.That(schedule.GetPersistedSnapshotCompactSize(blockNumber), Is.EqualTo(expected)); + } + + [TestCase(0, 0, 8192, false)] // block 0 → size 1 + [TestCase(0, 16, 8192, true)] // exactly CompactSize + [TestCase(0, 48, 8192, true)] // 48 & -48 = 16 + [TestCase(0, 8, 8192, false)] // intermediate (< CompactSize) + [TestCase(0, 32, 8192, false)] // large (> CompactSize) + [TestCase(0, 64, 8192, false)] // large + [TestCase(3, 13, 8192, true)] // shifted: (13+3) = 16 + [TestCase(3, 29, 8192, false)] // shifted large: 32 + [TestCase(0, 32, 16, true)] // max == CompactSize: alignment 32 capped to 16, exactly equals CompactSize + public void IsCompactSizeBoundary_TrueOnlyWhenWindowEqualsCompactSize(int offset, long blockNumber, int maxCompactSize, bool expected) + { + FlatDbConfig config = new() { CompactSize = 16, PersistedSnapshotMaxCompactSize = maxCompactSize }; + CompactionSchedule schedule = ScheduleHelper.CreateWithOffset(config, offset); + + Assert.That(schedule.IsCompactSizeBoundary(blockNumber), Is.EqualTo(expected)); + } } diff --git 
a/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerPersistedTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerPersistedTests.cs new file mode 100644 index 000000000000..23bf86b62afe --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerPersistedTests.cs @@ -0,0 +1,147 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Nethermind.Config; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Core.Test.Builders; +using Nethermind.Db; +using Nethermind.Logging; +using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.Trie; +using NSubstitute; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class FlatDbManagerPersistedTests +{ + private string _testDir = null!; + private ResourcePool _pool = null!; + private IProcessExitSource _processExitSource = null!; + private CancellationTokenSource _cts = null!; + private IFlatDbConfig _config = null!; + + [SetUp] + public void SetUp() + { + _testDir = Path.Combine(Path.GetTempPath(), $"nethermind_test_{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + _pool = new ResourcePool(new FlatDbConfig()); + _cts = new CancellationTokenSource(); + _processExitSource = Substitute.For(); + _processExitSource.Token.Returns(_cts.Token); + _config = new FlatDbConfig { CompactSize = 16, MaxInFlightCompactJob = 4, InlineCompaction = true }; + } + + [TearDown] + public void TearDown() + { + _cts.Cancel(); + _cts.Dispose(); + if (Directory.Exists(_testDir)) + Directory.Delete(_testDir, recursive: true); + } + + [Test] + public async Task ConstructorAcceptsPersistedRepository() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + await using FlatDbManager manager = new( + 
Substitute.For(), + _processExitSource, + Substitute.For(), + Substitute.For(), + repo, + Substitute.For(), + Substitute.For(), + _config, + new BlocksConfig(), + LimboLogs.Instance, + enableDetailedMetrics: false); + + Assert.That(manager, Is.Not.Null); + } + + [Test] + public async Task GatherReadOnlySnapshotBundle_IncludesPersistedSnapshots() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + TreePath path = new(Keccak.Compute("path"), 4); + byte[] nodeRlp = [0xC2, 0x80, 0x80]; + SnapshotContent content = new(); + content.StateNodes[path] = new TrieNode(NodeType.Leaf, nodeRlp); + Snapshot snap = new(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing); + + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + tier.ConvertToPersistedBase(snap).Dispose(); + + // Persisted snapshot covers s0→s1; mock reader anchored at s0 so the manager sees it as the persisted base. + IPersistenceManager persistenceManager = Substitute.For(); + IPersistence.IPersistenceReader reader = Substitute.For(); + reader.CurrentState.Returns(s0); + persistenceManager.LeaseReader().Returns(reader); + persistenceManager.GetCurrentPersistedStateId().Returns(s0); + + await using FlatDbManager manager = new( + Substitute.For(), + _processExitSource, + Substitute.For(), + Substitute.For(), + repo, + persistenceManager, + Substitute.For(), + _config, + new BlocksConfig(), + LimboLogs.Instance, + enableDetailedMetrics: false); + + ReadOnlySnapshotBundle bundle = manager.GatherReadOnlySnapshotBundle(s1); + + byte[]? 
result = bundle.TryLoadStateRlp(path, Keccak.Compute("hash"), ReadFlags.None); + Assert.That(result, Is.EqualTo(nodeRlp)); + + bundle.Dispose(); + } + + [Test] + public async Task DisposeAsync_DisposesPersistedRepository() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1).TestObject; + tier.ConvertToPersistedBase(new Snapshot(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + + FlatDbManager manager = new( + Substitute.For(), + _processExitSource, + Substitute.For(), + Substitute.For(), + repo, + Substitute.For(), + Substitute.For(), + _config, + new BlocksConfig(), + LimboLogs.Instance, + enableDetailedMetrics: false); + + await manager.DisposeAsync(); + + Assert.Pass("Dispose completed without error"); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerTests.cs index f780fd1c5803..3ba745b3e3a5 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatDbManagerTests.cs @@ -9,6 +9,7 @@ using Nethermind.Db; using Nethermind.Logging; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using NSubstitute; using NUnit.Framework; @@ -23,6 +24,7 @@ public class FlatDbManagerTests private ISnapshotCompactor _snapshotCompactor = null!; private ISnapshotRepository _snapshotRepository = null!; private IPersistenceManager _persistenceManager = null!; + private IPersistedSnapshotLoader _persistedSnapshotLoader = null!; private IFlatDbConfig _config = null!; private IBlocksConfig _blocksConfig = null!; private CancellationTokenSource _cts = null!; @@ -38,6 +40,7 @@ public void SetUp() 
_snapshotCompactor = Substitute.For(); _snapshotRepository = Substitute.For(); _persistenceManager = Substitute.For(); + _persistedSnapshotLoader = Substitute.For(); _config = new FlatDbConfig { CompactSize = 16, MaxInFlightCompactJob = 4, InlineCompaction = true }; _blocksConfig = Substitute.For(); _blocksConfig.SecondsPerSlot.Returns(12UL); @@ -46,6 +49,7 @@ public void SetUp() [TearDown] public void TearDown() { + _persistedSnapshotLoader.Dispose(); _cts.Cancel(); _cts.Dispose(); } @@ -57,6 +61,7 @@ public void TearDown() _snapshotCompactor, _snapshotRepository, _persistenceManager, + _persistedSnapshotLoader, _config, _blocksConfig, LimboLogs.Instance, @@ -123,7 +128,7 @@ public async Task AddSnapshot_BlockBelowPersistedState_ReturnsEarlyAndLogsWarnin await using FlatDbManager manager = CreateManager(); manager.AddSnapshot(snapshot, transientResource); - _snapshotRepository.DidNotReceive().TryAddSnapshot(Arg.Any()); + _snapshotRepository.DidNotReceive().TryAdd(Arg.Any(), SnapshotTier.InMemoryBase); _snapshotRepository.DidNotReceive().SetLastCommittedStateId(Arg.Any()); } @@ -132,7 +137,7 @@ public async Task AddSnapshot_ValidSnapshot_AddsToRepository() { StateId persistedStateId = CreateStateId(5); _persistenceManager.GetCurrentPersistedStateId().Returns(persistedStateId); - _snapshotRepository.TryAddSnapshot(Arg.Any()).Returns(true); + _snapshotRepository.TryAdd(Arg.Any(), SnapshotTier.InMemoryBase).Returns(true); ResourcePool realResourcePool = new(_config); StateId snapshotFrom = CreateStateId(10); @@ -143,7 +148,7 @@ public async Task AddSnapshot_ValidSnapshot_AddsToRepository() await using FlatDbManager manager = CreateManager(); manager.AddSnapshot(snapshot, transientResource); - _snapshotRepository.Received(1).TryAddSnapshot(snapshot); + _snapshotRepository.Received(1).TryAdd(snapshot, SnapshotTier.InMemoryBase); _snapshotRepository.Received(1).SetLastCommittedStateId(snapshotTo); } @@ -157,7 +162,7 @@ public async Task 
GatherReadOnlySnapshotBundle_CacheClearedPeriodically() _persistenceManager.LeaseReader().Returns(mockReader); _snapshotRepository.AssembleSnapshots(stateId, stateId, Arg.Any()) - .Returns(new SnapshotPooledList(0)); + .Returns(new AssembledSnapshotResult(new SnapshotPooledList(0), PersistedSnapshotList.Empty())); await using FlatDbManager manager = CreateManager(); @@ -183,7 +188,7 @@ public async Task AddSnapshot_DuplicateSnapshot_DisposesSnapshotAndReturnsResour { StateId persistedStateId = CreateStateId(5); _persistenceManager.GetCurrentPersistedStateId().Returns(persistedStateId); - _snapshotRepository.TryAddSnapshot(Arg.Any()).Returns(false); + _snapshotRepository.TryAdd(Arg.Any(), SnapshotTier.InMemoryBase).Returns(false); ResourcePool realResourcePool = new(_config); StateId snapshotFrom = CreateStateId(10); diff --git a/src/Nethermind/Nethermind.State.Flat.Test/FlatOverridableWorldScopeTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatOverridableWorldScopeTests.cs index 88644a104556..433dd9cd99d1 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/FlatOverridableWorldScopeTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatOverridableWorldScopeTests.cs @@ -16,6 +16,7 @@ using Nethermind.Int256; using Nethermind.Logging; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.State.Flat.ScopeProvider; using NSubstitute; using NUnit.Framework; @@ -60,7 +61,7 @@ public TestContext(FlatDbConfig? 
config = null) .Returns(_ => { SnapshotPooledList snapshotList = new(0); - return new ReadOnlySnapshotBundle(snapshotList, Substitute.For(), false); + return new ReadOnlySnapshotBundle(snapshotList, Substitute.For(), false, PersistedSnapshotStack.Empty()); }); flatDbManager.HasStateForBlock(Arg.Any()) diff --git a/src/Nethermind/Nethermind.State.Flat.Test/FlatPersistenceTestExtensions.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatPersistenceTestExtensions.cs new file mode 100644 index 000000000000..40cc28dca063 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatPersistenceTestExtensions.cs @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Crypto; +using Nethermind.State.Flat.Persistence; + +namespace Nethermind.State.Flat.Test; + +/// +/// Test-only convenience overloads for that iterate +/// the full key range. Production callers always pass explicit bounds, so these whole-range +/// forwarders live with the tests rather than on the production interface. 
+/// +internal static class FlatPersistenceTestExtensions +{ + public static IPersistence.IFlatIterator CreateAccountIterator(this IPersistence.IPersistenceReader reader) + => reader.CreateAccountIterator(ValueKeccak.Zero, ValueKeccak.MaxValue); + + public static IPersistence.IFlatIterator CreateStorageIterator(this IPersistence.IPersistenceReader reader, in ValueHash256 accountKey) + => reader.CreateStorageIterator(accountKey, ValueKeccak.Zero, ValueKeccak.MaxValue); +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/FlatTestContainer.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatTestContainer.cs new file mode 100644 index 000000000000..68e81e884a08 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatTestContainer.cs @@ -0,0 +1,134 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using System.Threading; +using Autofac; +using Nethermind.Api; +using Nethermind.Config; +using Nethermind.Core; +using Nethermind.Core.Test.IO; +using Nethermind.Db; +using Nethermind.Init.Modules; +using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using NSubstitute; + +namespace Nethermind.State.Flat.Test; + +/// +/// Builds the persisted-tier flatdb component graph the way production does — by loading +/// into an Autofac container — then overlays the handful of +/// test-only overrides every fixture needs: a temp BaseDbPath, in-memory catalog/metadata +/// s, , a cancellable , and a +/// blob arena sized independently of the trie-RLP arena. Resolving any persisted-tier component returns +/// the same singletons the production module wires, so tests run against a prod-representative graph. +/// +/// +/// The container builds lazily on first resolve; building runs , +/// and disposal tears down the loader before the temp dir is removed. 
Reopen/restart tests build a second +/// over the same and the same +/// instance to verify data survives a restart. +/// +internal sealed class FlatTestContainer : IDisposable +{ + private readonly ContainerBuilder _builder; + private readonly CancellationTokenSource _cts = new(); + private readonly TempPath? _ownedTempDir; + private IContainer? _container; + + public FlatDbConfig Config { get; } + + /// Data directory the persisted tier lives under; pass it to a second container to reopen. + public string BaseDbPath { get; } + + /// The in-memory catalog; pass it to a second container to simulate a restart. + public IDb CatalogDb { get; } + + public FlatTestContainer( + FlatDbConfig? config = null, + long arenaFileSizeBytes = 1024L * 1024 * 1024, + long blobFileSizeBytes = 1024L * 1024, + string? baseDbPath = null, + IDb? catalogDb = null, + Action? configure = null) + { + Config = config ?? new FlatDbConfig(); + Config.ArenaFileSizeBytes = arenaFileSizeBytes; + + if (baseDbPath is null) + { + _ownedTempDir = TempPath.GetTempDirectory(); + BaseDbPath = _ownedTempDir.Path; + } + else + { + BaseDbPath = baseDbPath; + } + + CatalogDb = catalogDb ?? new MemDb(); + + IProcessExitSource processExitSource = Substitute.For(); + processExitSource.Token.Returns(_cts.Token); + + _builder = new ContainerBuilder() + .AddModule(new FlatWorldStateModule(Config)) + .AddSingleton(Config) + .AddSingleton(LimboLogs.Instance) + .AddSingleton(new InitConfig { BaseDbPath = BaseDbPath }) + .AddSingleton(processExitSource) + // The production module wires the catalog and metadata to columned RocksDB via IDbFactory, + // which the test project does not provide; an in-memory db is behavior-equivalent here. 
+ .AddKeyedSingleton(DbNames.PersistedSnapshotCatalog, CatalogDb) + .AddKeyedSingleton(DbNames.Metadata, new MemDb()) + // The module sizes the blob arena off ArenaFileSizeBytes (shared with the trie-RLP arena); + // tests size the two independently, so override the blob arena's file size. + .AddSingleton(initConfig => + new BlobArenaManager(Path.Combine(initConfig.BaseDbPath, "persisted_snapshot", "blob"), blobFileSizeBytes)) + // Config defaults to EnableLongFinality=false, which makes the module swap in the Null + // catalog/loader. These fixtures exercise the real persisted tier, so force the real catalog + // back (last-registration wins); the real loader is reached via concrete resolves below. + .AddSingleton(ctx => ctx.Resolve()); + + configure?.Invoke(_builder); + } + + private IContainer Container => _container ??= BuildAndLoad(); + + private IContainer BuildAndLoad() + { + IContainer container = _builder.Build(); + container.Resolve().Load(); + return container; + } + + public T Resolve() where T : notnull => Container.Resolve(); + + public SnapshotRepository Repository => Resolve(); + public IPersistedSnapshotLoader Loader => Resolve(); + public ResourcePool ResourcePool => Resolve(); + public ArenaManager Arena => Resolve(); + public BlobArenaManager Blobs => Resolve(); + public PersistedSnapshotCompactor Compactor => Resolve(); + + /// Converts to a persisted base via the production loader and + /// returns it pre-leased from the repository so callers hold a disposable handle for assertions. 
+ public PersistedSnapshot ConvertToPersistedBase(Snapshot snapshot) + { + Loader.ConvertAndRegister(snapshot); + using PersistedSnapshotList bases = Repository.LeaseBaseSnapshotsInRange(snapshot.From, snapshot.To); + PersistedSnapshot persisted = bases[0]; + _ = persisted.TryAcquire(); + return persisted; + } + + public void Dispose() + { + _cts.Cancel(); + _container?.Dispose(); + _cts.Dispose(); + _ownedTempDir?.Dispose(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/FlatTestHelpers.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatTestHelpers.cs index bc1b2f3ae33a..54070df528e5 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/FlatTestHelpers.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatTestHelpers.cs @@ -7,6 +7,7 @@ using Nethermind.Core.Crypto; using Nethermind.Int256; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.Trie; using NSubstitute; @@ -33,7 +34,8 @@ public static SnapshotPooledList SnapshotList(params Snapshot[] snapshots) /// optionally pre-populating the snapshot content via . /// public static ReadOnlySnapshotBundle MakeBundle(ResourcePool pool, Action? 
populate = null) => - new(SnapshotList(MakeSnapshot(pool, populate)), Substitute.For(), recordDetailedMetrics: false); + new(SnapshotList(MakeSnapshot(pool, populate)), Substitute.For(), + recordDetailedMetrics: false, PersistedSnapshotStack.Empty()); } /// diff --git a/src/Nethermind/Nethermind.State.Flat.Test/FlatWorldStateScopeProviderTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/FlatWorldStateScopeProviderTests.cs index bac8aa3d3c5b..269ccce10152 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/FlatWorldStateScopeProviderTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/FlatWorldStateScopeProviderTests.cs @@ -5,6 +5,7 @@ using System.Threading; using System.Threading.Tasks; using Autofac; +using Nethermind.Api; using Nethermind.Config; using Nethermind.Core; using Nethermind.Core.Crypto; @@ -16,6 +17,7 @@ using Nethermind.Int256; using Nethermind.Logging; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.State.Flat.ScopeProvider; using Nethermind.Trie; using Nethermind.Trie.Pruning; @@ -78,12 +80,14 @@ public TestContext(FlatDbConfig? 
config = null) .AddSingleton(LimboLogs.Instance) .AddSingleton(config) .AddSingleton(_ => new TrieStoreScopeProvider.KeyValueWithBatchingBackedCodeDb(new TestMemDb())) + .AddSingleton(_ => Substitute.For()) ; // Externally owned because snapshot bundle take ownership _containerBuilder.RegisterType() .WithParameter(TypedParameter.From(false)) // recordDetailedMetrics .WithParameter(TypedParameter.From(ReadOnlySnapshots)) + .WithParameter(TypedParameter.From(PersistedSnapshotStack.Empty())) .ExternallyOwned(); ConfigureSnapshotBundle(); diff --git a/src/Nethermind/Nethermind.State.Flat.Test/Io/PooledByteBufferWriterTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/Io/PooledByteBufferWriterTests.cs new file mode 100644 index 000000000000..b233b4e81e5f --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/Io/PooledByteBufferWriterTests.cs @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.State.Flat.Io; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test.Io; + +[TestFixture] +public class PooledByteBufferWriterTests +{ + [TestCase(1)] + [TestCase(5000)] + public void ZeroCapacity_GrowsToFitFirstWrite(int size) + { + using PooledByteBufferWriter pooled = new(initialCapacity: 0); + ref PooledByteBufferWriter.Writer w = ref pooled.GetWriter(); + + System.Span span = w.GetSpan(size); + for (int i = 0; i < size; i++) span[i] = (byte)(i & 0xff); + w.Advance(size); + + System.ReadOnlySpan written = pooled.WrittenSpan; + Assert.That(written.Length, Is.EqualTo(size)); + for (int i = 0; i < size; i++) Assert.That(written[i], Is.EqualTo((byte)(i & 0xff))); + } + + // Exercises the Buffer.MemoryCopy branch inside Grow (_written > 0). 
+ [Test] + public void Grow_PreservesExistingContentAcrossMultipleGrows() + { + using PooledByteBufferWriter pooled = new(initialCapacity: 4); + ref PooledByteBufferWriter.Writer w = ref pooled.GetWriter(); + + for (int chunk = 0; chunk < 6; chunk++) + { + const int len = 100; + System.Span span = w.GetSpan(len); + for (int i = 0; i < len; i++) span[i] = (byte)((chunk * 100 + i) & 0xff); + w.Advance(len); + } + + System.ReadOnlySpan written = pooled.WrittenSpan; + Assert.That(written.Length, Is.EqualTo(600)); + for (int j = 0; j < 600; j++) Assert.That(written[j], Is.EqualTo((byte)(j & 0xff))); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/LongFinalityIntegrationTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/LongFinalityIntegrationTests.cs new file mode 100644 index 000000000000..e04d9a1b1016 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/LongFinalityIntegrationTests.cs @@ -0,0 +1,414 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Autofac; +using Nethermind.Config; +using Nethermind.Init.Modules; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Core.Test.Builders; +using Nethermind.Db; +using Nethermind.Int256; +using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.Persistence; +using Nethermind.Serialization.Rlp; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; +using NSubstitute; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class LongFinalityIntegrationTests +{ + private string _testDir = null!; + private ResourcePool _pool = null!; + private IProcessExitSource _processExitSource = null!; + private CancellationTokenSource _cts = null!; + private IFlatDbConfig _config = null!; + private ArenaManager _memArena = null!; + 
private BlobArenaManager _helperBlobs = null!; + + [SetUp] + public void SetUp() + { + _testDir = Path.Combine(Path.GetTempPath(), $"nethermind_test_{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + _pool = new ResourcePool(new FlatDbConfig()); + _cts = new CancellationTokenSource(); + _processExitSource = Substitute.For(); + _processExitSource.Token.Returns(_cts.Token); + _config = new FlatDbConfig { CompactSize = 16, MaxInFlightCompactJob = 4, InlineCompaction = true }; + _memArena = TestFixtureHelpers.CreateArenaManager(Path.Combine(_testDir, "mem-arena")); + _helperBlobs = new BlobArenaManager(Path.Combine(_testDir, "helper-blobs"), 4L * 1024 * 1024); + } + + [TearDown] + public void TearDown() + { + _cts.Cancel(); + _cts.Dispose(); + _helperBlobs.Dispose(); + _memArena.Dispose(); + if (Directory.Exists(_testDir)) + Directory.Delete(_testDir, recursive: true); + } + + private Snapshot CreateSnapshot(StateId from, StateId to, Action configure) + { + SnapshotContent content = new(); + configure(content); + return new Snapshot(from, to, content, _pool, ResourcePool.Usage.MainBlockProcessing); + } + + private PersistedSnapshot CreatePersistedSnapshot(StateId from, StateId to, byte[] data) => + TestFixtureHelpers.CreatePersistedSnapshot(_memArena, _helperBlobs, from, to, data); + + [Test] + public void FullStack_PersistAndQuery_AccountsStorageAndTrieNodes() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + TreePath statePath = new(Keccak.Compute("state_path"), 4); + Hash256 storageAddr = Keccak.Compute("storage_address"); + TreePath storagePath = new(Keccak.Compute("storage_path"), 6); + byte[] stateRlp = [0xC2, 0x80, 0x80]; + byte[] storageRlp = [0xC1, 0x80]; + + Snapshot snap = CreateSnapshot(s0, s1, c => + { + c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(500).TestObject; + byte[] 
slotVal = new byte[32]; slotVal[31] = 0xFF; + c.Storages[(TestItem.AddressA, (UInt256)42)] = new SlotValue(slotVal); + c.SelfDestructedStorageAddresses[TestItem.AddressB] = false; + c.StateNodes[statePath] = new TrieNode(NodeType.Leaf, stateRlp); + c.StorageNodes[(storageAddr, storagePath)] = new TrieNode(NodeType.Branch, storageRlp); + }); + + tier.ConvertToPersistedBase(snap).Dispose(); + Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? persisted), Is.True); + + Assert.That(persisted!.TryLoadStateNodeRlp(statePath, out byte[]? stateResult), Is.True); + Assert.That(stateResult, Is.EqualTo(stateRlp)); + Assert.That(persisted.TryLoadStorageNodeRlp(storageAddr.ValueHash256, storagePath, out byte[]? storageResult), Is.True); + Assert.That(storageResult, Is.EqualTo(storageRlp)); + persisted.Dispose(); + } + + // 4 KiB — each snapshot's metadata reservation page-rounds to fill the whole arena + // file, so the file fully-dies on the sole reservation's MarkDead and the punch path + // is short-circuited. 1 MiB — both snapshots' reservations pack into one arena file, + // so snap1's dispose finds snap2 still live, MarkDead returns true, and the bare + // ArenaReservation.CleanUp would (without the PersistOnShutdown-aware fix) punch the + // dead range in a live preserve-flagged file, zeroing snap1's metadata for session 2. + [TestCase(4096L, TestName = "Repository_Restart_PreservesAllData_PerSnapshotArenaFiles")] + [TestCase(1L * 1024 * 1024, TestName = "Repository_Restart_PreservesAllData_SharedArenaAcrossSnapshots")] + public void Repository_Restart_PreservesAllData(long maxArenaSize) + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + + // Per-snapshot trie nodes are capped at 568 bytes (MaxTrieNodeRlpBytes), so use + // many smaller RLPs per snapshot to push the cumulative blob frontier well past + // 1 OS page (4 KiB). 
Without enough total blob bytes, a stray + // BlobArenaManager.TryResetOrphanedFrontier punch over [0, frontier) is a no-op + // on tmpfs (sub-page punches are dropped), letting the test silently pass with + // the bug present. 10 × ~500 bytes per snap = ~5 KiB per snap = ~10 KiB shared + // blob frontier → punch reliably zeros page 0. + const int nodesPerSnap = 10; + byte[] body1 = new byte[500]; Array.Fill(body1, (byte)0xAA); + byte[] body2 = new byte[500]; Array.Fill(body2, (byte)0xBB); + byte[] rlp1 = Rlp.Encode(body1).Bytes; // ~503 bytes — under MaxTrieNodeRlpBytes + byte[] rlp2 = Rlp.Encode(body2).Bytes; + TreePath[] paths1 = new TreePath[nodesPerSnap]; + TreePath[] paths2 = new TreePath[nodesPerSnap]; + for (int i = 0; i < nodesPerSnap; i++) + { + paths1[i] = new TreePath(Keccak.Compute($"path1_{i}"), 4); + paths2[i] = new TreePath(Keccak.Compute($"path2_{i}"), 4); + } + MemDb catalogDb = new(); + + using (FlatTestContainer tier1 = new(arenaFileSizeBytes: maxArenaSize, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier1.Repository; + + tier1.ConvertToPersistedBase(CreateSnapshot(s0, s1, c => + { + foreach (TreePath p in paths1) c.StateNodes[p] = new TrieNode(NodeType.Leaf, rlp1); + c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + })).Dispose(); + + tier1.ConvertToPersistedBase(CreateSnapshot(s1, s2, c => + { + foreach (TreePath p in paths2) c.StateNodes[p] = new TrieNode(NodeType.Leaf, rlp2); + c.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(200).TestObject; + })).Dispose(); + } + + // Repository.Dispose flags every loaded snapshot's arena reservation AND every + // referenced blob file with PersistOnShutdown before tearing down the managers, + // so both file kinds must survive on disk for the catalog to re-bind in session 2. + // Split assertions so a missing flag on one side fingerprints which side regressed. 
+ string arenaDir = Path.Combine(_testDir, "persisted_snapshot", "arena"); + string blobDir = Path.Combine(_testDir, "persisted_snapshot", "blob"); + // PersistedBase metadata lives in the small-arena pool (sub-CompactSize tier). + Assert.That(Directory.GetFiles(arenaDir, "small_arena_*.bin"), Is.Not.Empty, + "arena files were deleted on Dispose — PersistOnShutdown flag did not propagate to ArenaFile"); + string[] blobFiles = Directory.GetFiles(blobDir, "blob_*.bin"); + Assert.That(blobFiles, Is.Not.Empty, + "blob files were deleted on Dispose — PersistOnShutdown flag did not propagate to BlobArenaFile"); + // No pre-extension: blob length tracks the actual data extent. If we ever drift + // back into pre-extending or punch-zero-on-shutdown, a preserve-flagged file ends + // up with length 0 (truncated) or length MaxSize (pre-extended sparse) — neither + // matches the snapshot's written extent. Either symptom would be caught here. + foreach (string blobFile in blobFiles) + { + long len = new FileInfo(blobFile).Length; + Assert.That(len, Is.GreaterThan(0), + $"{blobFile} truncated on Dispose — preserve flag did not protect a referenced blob"); + Assert.That(len, Is.LessThanOrEqualTo(1024 * 1024), + $"{blobFile} length {len} > 1 MiB cap — pre-extension regressed"); + } + + using (FlatTestContainer tier2 = new(arenaFileSizeBytes: 4096, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier2.Repository; + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(2)); + + // s0→s1 carries paths1[] + AddressA; s1→s2 carries paths2[] + AddressB. Every + // state node round-trips intact — a stray BlobArenaManager.TryResetOrphanedFrontier + // punch during the session-1 dispose would zero at least the first 4 KiB of the + // blob, so the early-index nodes' RLPs would either not decode or read as zeros. + // The cross-snapshot misses verify the snapshot boundary survives reload (i.e. + // AddressB does NOT bleed into snap1's view, and vice versa). 
+ Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? snap1), Is.True); + foreach (TreePath p in paths1) + { + Assert.That(snap1!.TryLoadStateNodeRlp(p, out byte[]? r), Is.True, $"snap1 missing {p}"); + Assert.That(r, Is.EqualTo(rlp1), $"snap1 state node at {p} read back corrupted"); + } + Assert.That(snap1!.TryGetAccount(TestItem.AddressA, out Account? a1), Is.True); + Assert.That(snap1.TryGetAccount(TestItem.AddressB, out Account? snap1MissB), Is.False); + snap1.Dispose(); + + Assert.That(repo.TryLeasePersistedState(s2, SnapshotTier.PersistedBase, out PersistedSnapshot? snap2), Is.True); + foreach (TreePath p in paths2) + { + Assert.That(snap2!.TryLoadStateNodeRlp(p, out byte[]? r), Is.True, $"snap2 missing {p}"); + Assert.That(r, Is.EqualTo(rlp2), $"snap2 state node at {p} read back corrupted"); + } + Assert.That(snap2!.TryGetAccount(TestItem.AddressB, out Account? a2), Is.True); + Assert.That(snap2.TryGetAccount(TestItem.AddressA, out Account? 
snap2MissA), Is.False); + snap2.Dispose(); + + Assert.That(a1!.Balance, Is.EqualTo((UInt256)100)); + Assert.That(a2!.Balance, Is.EqualTo((UInt256)200)); + Assert.That(snap1MissB, Is.Null); + Assert.That(snap2MissA, Is.Null); + } + } + + + [Test] + public void MergeSnapshotData_AllEntryTypes() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + + TreePath statePath = new(Keccak.Compute("state"), 4); + Hash256 storageAddr = Keccak.Compute("addr"); + TreePath storagePath = new(Keccak.Compute("stor_path"), 6); + + Snapshot snap1 = CreateSnapshot(s0, s1, c => + { + c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + c.StateNodes[statePath] = new TrieNode(NodeType.Leaf, [0xC0]); + c.StorageNodes[(storageAddr, storagePath)] = new TrieNode(NodeType.Branch, [0xC1, 0x80]); + }); + + Snapshot snap2 = CreateSnapshot(s1, s2, c => + { + c.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(200).TestObject; + c.StateNodes[statePath] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]); // Override + }); + + byte[] data1 = PersistedSnapshotBuilderTestExtensions.Build(snap1, _helperBlobs); + byte[] data2 = PersistedSnapshotBuilderTestExtensions.Build(snap2, _helperBlobs); + PersistedSnapshot baseSnap1 = CreatePersistedSnapshot(s0, s1, data1); + PersistedSnapshot baseSnap2 = CreatePersistedSnapshot(s1, s2, data2); + PersistedSnapshotList toMerge = new(2) { baseSnap1, baseSnap2 }; + byte[] merged = PersistedSnapshotBuilderTestExtensions.NWayMergeSnapshots(toMerge); + + PersistedSnapshot mergedSnap = CreatePersistedSnapshot(s0, s2, merged); + + // State node should have newer value + Assert.That(mergedSnap.TryLoadStateNodeRlp(statePath, out byte[]? 
stateRlpResult), Is.True); + Assert.That(stateRlpResult, Is.EqualTo(new byte[] { 0xC2, 0x80, 0x80 })); + + // Storage node from older should be preserved + Assert.That(mergedSnap.TryLoadStorageNodeRlp(storageAddr.ValueHash256, storagePath, out byte[]? storageRlpResult), Is.True); + Assert.That(storageRlpResult, Is.EqualTo(new byte[] { 0xC1, 0x80 })); + + Assert.That(mergedSnap.TryGetAccount(TestItem.AddressA, out _), Is.True); + Assert.That(mergedSnap.TryGetAccount(TestItem.AddressB, out _), Is.True); + } + + [TestCase(10)] + [TestCase(100)] + public void ManySnapshots_PersistAndQuery(int snapshotCount) + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 64 * 1024); + SnapshotRepository repo = tier.Repository; + + StateId prev = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= snapshotCount; i++) + { + StateId current = new(i, Keccak.Compute(i.ToString())); + tier.ConvertToPersistedBase(CreateSnapshot(prev, current, c => + c.Accounts[new Address(Keccak.Compute(i.ToString()))] = + Build.An.Account.WithBalance((UInt256)i).TestObject)).Dispose(); + prev = current; + } + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(snapshotCount)); + } + + + [Test] + public async Task FlatDbManager_EndToEnd_WithPersistedSnapshots() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + TreePath path = new(Keccak.Compute("e2e_path"), 4); + byte[] nodeRlp = [0xC1, 0x80]; + + tier.ConvertToPersistedBase(CreateSnapshot(s0, s1, c => + c.StateNodes[path] = new TrieNode(NodeType.Leaf, nodeRlp))).Dispose(); + + // Set up persistence reader at s0 — persisted snapshot fills gap s0→s1 + IPersistenceManager persistenceManager = Substitute.For(); + IPersistence.IPersistenceReader reader = Substitute.For(); + reader.CurrentState.Returns(s0); + persistenceManager.LeaseReader().Returns(reader); + 
persistenceManager.GetCurrentPersistedStateId().Returns(s0); + + await using FlatDbManager manager = new( + Substitute.For(), + _processExitSource, + Substitute.For(), + Substitute.For(), + repo, + persistenceManager, + Substitute.For(), + _config, + new BlocksConfig(), + LimboLogs.Instance, + enableDetailedMetrics: false); + + ReadOnlySnapshotBundle bundle = manager.GatherReadOnlySnapshotBundle(s1); + + byte[]? result = bundle.TryLoadStateRlp(path, Keccak.Compute("hash"), ReadFlags.None); + Assert.That(result, Is.EqualTo(nodeRlp)); + + bundle.Dispose(); + } + + [Test] + public void Prune_AfterRestart_Works() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + StateId s5 = new(5, Keccak.Compute("5")); + MemDb catalogDb = new(); + + using (FlatTestContainer tier1 = new(arenaFileSizeBytes: 4096, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier1.Repository; + tier1.ConvertToPersistedBase(CreateSnapshot(s0, s1, c => + c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1).TestObject)).Dispose(); + tier1.ConvertToPersistedBase(CreateSnapshot(s1, s2, c => + c.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(2).TestObject)).Dispose(); + tier1.ConvertToPersistedBase(CreateSnapshot(s2, s5, c => + c.Accounts[TestItem.AddressC] = Build.An.Account.WithBalance(5).TestObject)).Dispose(); + } + + using (FlatTestContainer tier2 = new(arenaFileSizeBytes: 4096, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier2.Repository; + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(3)); + + repo.RemovePersistedStatesUntil(3); // s1 and s2 removed + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(1)); + } + + using (FlatTestContainer tier3 = new(arenaFileSizeBytes: 4096, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier3.Repository; + Assert.That(repo.PersistedSnapshotCount, 
Is.EqualTo(1)); + } + } + + [Test] + public void EmptySnapshot_PersistsAndLoads() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + Snapshot empty = CreateSnapshot(s0, s1, _ => { }); + tier.ConvertToPersistedBase(empty).Dispose(); + + Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? persisted), Is.True); + Assert.That(persisted!.TryGetAccount(TestItem.AddressA, out _), Is.False); + Assert.That(persisted.TryLoadStateNodeRlp(new TreePath(Keccak.Compute("any"), 4), out _), Is.False); + persisted.Dispose(); + } + + [Test] + public void Configuration_DefaultValues() + { + FlatDbConfig config = new(); + Assert.That(config.EnableLongFinality, Is.False); + Assert.That(config.MaxReorgDepth, Is.EqualTo(256)); + Assert.That(config.LongFinalityMaxReorgDepth, Is.EqualTo(90000)); + Assert.That(config.ArenaFileSizeBytes, Is.EqualTo(1L * 1024 * 1024 * 1024)); + } + + [Test] + public void DisabledLongFinality_WiresInertPersistedTier() + { + FlatDbConfig config = new() { EnableLongFinality = false }; + using IContainer container = new ContainerBuilder() + .AddModule(new FlatWorldStateModule(config)) + .AddSingleton(config) + .AddSingleton(LimboLogs.Instance) + .Build(); + + Assert.That(container.Resolve(), Is.SameAs(NullSnapshotCatalog.Instance)); + Assert.That(container.Resolve(), Is.SameAs(NullPersistedSnapshotLoader.Instance)); + Assert.That(container.Resolve(), Is.SameAs(NullPersistedSnapshotCompactor.Instance)); + + // The Null loader/catalog keep the tier inert: loading is a no-op and nothing is ever recorded. 
+ container.Resolve().Load(); + Assert.That(container.Resolve().Load(), Is.Empty); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/Nethermind.State.Flat.Test.csproj b/src/Nethermind/Nethermind.State.Flat.Test/Nethermind.State.Flat.Test.csproj index a9ef96f63d55..8601141c49fe 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/Nethermind.State.Flat.Test.csproj +++ b/src/Nethermind/Nethermind.State.Flat.Test/Nethermind.State.Flat.Test.csproj @@ -5,6 +5,7 @@ Nethermind.State.Flat.Test enable + true diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotBuilderTestExtensions.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotBuilderTestExtensions.cs new file mode 100644 index 000000000000..81b5d33945f1 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotBuilderTestExtensions.cs @@ -0,0 +1,66 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using Nethermind.Core.Collections; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Storage; + +namespace Nethermind.State.Flat.Test; + +/// +/// Allocates output buffers internally, which production code avoids. +/// +internal static class PersistedSnapshotBuilderTestExtensions +{ + /// + /// The caller must keep alive across the test fixture so that a + /// constructed from the returned bytes can lease the blob + /// file via the same manager — mirroring how production wires BlobArenaManager as + /// a long-lived shared component. 
+ /// + public static byte[] Build(Snapshot snapshot, BlobArenaManager blobs) + { + int estimatedSize = checked((int)PersistedSnapshotBuilder.EstimateSize(snapshot)); + using PooledByteBufferWriter pooled = new(estimatedSize); + using BlobArenaWriter blobWriter = blobs.CreateWriter(estimatedSize); + using Nethermind.State.Flat.Persistence.BloomFilter.BloomFilter bloom = + Nethermind.State.Flat.Persistence.BloomFilter.BloomFilter.AlwaysTrue(); + PersistedSnapshotBuilder.Build( + snapshot, ref pooled.GetWriter(), blobWriter, bloom); + blobWriter.Complete(); + return pooled.WrittenSpan.ToArray(); + } + + public static byte[] NWayMergeSnapshots(PersistedSnapshotList snapshots) + { + if (snapshots.Count == 0) throw new ArgumentException("Cannot merge empty snapshot list"); + if (snapshots.Count == 1) + { + using WholeReadSession session = snapshots[0].BeginWholeReadSession(); + return TestFixtureHelpers.ReadAll(session); + } + + long totalSize = 0; + for (int i = 0; i < snapshots.Count; i++) totalSize += snapshots[i].Size; + totalSize += 4096; + + using PooledByteBufferWriter pooled = new(checked((int)totalSize)); + int n = snapshots.Count; + using ArrayPoolList sessionsList = new(n, n); + WholeReadSession[] sessionArr = sessionsList.UnsafeGetInternalArray(); + try + { + for (int i = 0; i < n; i++) + sessionArr[i] = snapshots[i].BeginWholeReadSession(); + PersistedSnapshotMerger.NWayMergeSnapshots( + sessionsList.AsSpan(), ref pooled.GetWriter(), bloom: Nethermind.State.Flat.Persistence.BloomFilter.BloomFilter.AlwaysTrue()); + } + finally + { + for (int i = 0; i < n; i++) sessionArr[i]?.Dispose(); + } + return pooled.WrittenSpan.ToArray(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotCompactorTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotCompactorTests.cs new file mode 100644 index 000000000000..e04ff456cec6 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotCompactorTests.cs @@ -0,0 
+1,1462 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.Collections.Generic; +using System.IO; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Core.Test.Builders; +using Nethermind.Int256; +using Nethermind.Db; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.Persistence.BloomFilter; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class PersistedSnapshotCompactorTests +{ + private ResourcePool _pool = null!; + private ArenaManager _memArena = null!; + private string _memArenaDir = null!; + + [SetUp] + public void SetUp() + { + _pool = new ResourcePool(new FlatDbConfig()); + _memArenaDir = Path.Combine(Path.GetTempPath(), $"nm-compactortest-arena-{Guid.NewGuid():N}"); + _memArena = TestFixtureHelpers.CreateArenaManager(_memArenaDir); + } + + [TearDown] + public void TearDown() + { + _memArena.Dispose(); + try { Directory.Delete(_memArenaDir, recursive: true); } catch { /* best-effort */ } + } + + /// + /// Regression for large-tier compactions where N approaches the typical + /// compactSize/CompactSize ceiling (~32). Each source carries a unique account + /// plus a shared overlapping account (AddressA) with a distinct slot per block, so the + /// per-address sub-tag merge runs with matchCount == N on every iteration and + /// the slot merge exercises the fused inline bloom path with N slot inputs. Failures + /// here flag mis-cached keys, missed bound refresh after MoveNext, or + /// destruct-barrier/slot-bound mismatches in MergeEntries. + /// + [TestCase(8)] + [TestCase(16)] + [TestCase(32)] + public void TryCompactPersistedSnapshots_MergesNBaseSnapshots(int n) + { + // CompactSize=4. 
n is a power of 2 in {8, 16, 32}, so n & -n == n: block n's natural + // window covers the whole (0, n] range and DoCompactSnapshot triggers a single merge. + using FlatTestContainer tier = new( + arenaFileSizeBytes: 256 * 1024, + blobFileSizeBytes: 4 * 1024 * 1024, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 4 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId prev = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= n; i++) + { + StateId next = new(i, Keccak.Compute($"s{i}")); + SnapshotContent c = new(); + c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 100)).TestObject; + // Shared overlapping account: same AddressA every block, distinct balance and + // a distinct slot — drives matchCount == N through MergeEntries, + // and the slot merge sees N inputs with N unique slot keys. + c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance((UInt256)i).TestObject; + c.Storages[(TestItem.AddressA, (UInt256)i)] = new SlotValue(new byte[] { (byte)i }); + tier.ConvertToPersistedBase(new Snapshot(prev, next, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + prev = next; + } + + compactor.DoCompactSnapshot(prev); + + Assert.That(repo.TryLeasePersistedState(prev, SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True); + try + { + Assert.That(compacted!.From.BlockNumber, Is.EqualTo(0)); + Assert.That(compacted.To.BlockNumber, Is.EqualTo(n)); + + for (int i = 1; i <= n; i++) + { + Assert.That(compacted.TryGetAccount(TestItem.Addresses[i - 1], out _), Is.True, + $"Account from block {i} missing"); + } + + Assert.That(compacted.TryGetAccount(TestItem.AddressA, out Account? 
a), Is.True); + Assert.That(a!.Balance, Is.EqualTo((UInt256)n), "Newest balance must win on the overlapping account"); + + for (int i = 1; i <= n; i++) + { + SlotValue slot = default; + Assert.That(compacted.TryGetSlot(TestItem.AddressA, (UInt256)i, ref slot), Is.True, + $"Slot {i} must survive merge"); + Assert.That(slot.AsReadOnlySpan.ToArray(), Is.EqualTo(new SlotValue(new byte[] { (byte)i }).AsReadOnlySpan.ToArray()), + $"Slot {i} value mismatch"); + } + } + finally { compacted!.Dispose(); } + } + + /// + /// Regression for large-tier boundary compaction of an address with 256k sequential + /// storage slots. Each big-endian-contiguous run of 65536 slots forms one dense 30-byte + /// slot-prefix group; merging the per-block slices accumulates a group's inner sub-slot + /// table past ArenaBufferWriter's 1 MiB buffer. No single source snapshot crosses + /// that threshold (16384 slots per block), so the oversized value first appears inside + /// NWayNestedStreamingSlotMerge during the merge — the mainnet crash site. + /// + [Test] + public void DoCompactSnapshot_SequentialSlotsAcrossDensePrefixGroups_RoundTrips() + { + const int snapshotCount = 16; + const int slotsPerSnapshot = 16 * 1024; // 16 × 16384 = 256k merged slots + + // 64 MiB shared arena: the per-block snapshots and the ~10 MiB compacted output + // stay below the 512 MiB dedicated-arena threshold, so each must fit a shared file. + using FlatTestContainer tier = new( + arenaFileSizeBytes: 64 * 1024 * 1024, + blobFileSizeBytes: 4 * 1024 * 1024, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 4 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + // Each block writes a contiguous 16384-slot slice on AddressA. A slice stays well + // under ArenaBufferWriter's 1 MiB buffer, so every per-block build succeeds; only + // the merged 65536-slot prefix groups cross the threshold. 
+ StateId prev = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= snapshotCount; i++) + { + StateId next = new(i, Keccak.Compute($"s{i}")); + SnapshotContent c = new(); + TestFixtureHelpers.AddSequentialSlots(c, TestItem.AddressA, + firstSlot: (i - 1) * slotsPerSnapshot + 1, count: slotsPerSnapshot); + tier.ConvertToPersistedBase( + new Snapshot(prev, next, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + prev = next; + } + + compactor.DoCompactSnapshot(prev); + + Assert.That(repo.TryLeasePersistedState(prev, SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True); + try + { + int totalSlots = snapshotCount * slotsPerSnapshot; + foreach (int probe in new[] { 1, 65535, 65536, 131072, totalSlots }) + { + SlotValue slot = default; + Assert.That(compacted!.TryGetSlot(TestItem.AddressA, (UInt256)probe, ref slot), Is.True, $"slot {probe} missing"); + Assert.That(slot.AsReadOnlySpan.SequenceEqual(TestFixtureHelpers.SequentialSlotValue(probe)), Is.True, + $"slot {probe} value mismatch"); + } + } + finally { compacted!.Dispose(); } + } + + /// + /// Regression for bloom completeness on a single matching source (matchCount==1), which + /// routes through the value mergers' MergeValues like any other key. We pack + /// AddressA into one source with slots plus storage-trie nodes at every depth tier (top / + /// compact / fallback) and pair it with an unrelated address in the second source so that + /// matchCount==1 for AddressA. The merge must still bloom-add the address key, every slot + /// key, and all three storage-trie sub-tag node keys. The bloom manager is shared with the + /// compactor so bloomCapacity is non-zero and the merger produces a real + /// (non-AlwaysTrue) bloom we can probe. 
+ /// + [Test] + public void Compact_SingleSourceAddress_AddsAllSubTagBloomKeys() + { + using FlatTestContainer tier = new( + arenaFileSizeBytes: 64 * 1024, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 2 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + Hash256 addrHash256 = Keccak.Compute(TestItem.AddressA.Bytes); + TreePath shortPath = new(Keccak.Compute("trie_top"), 4); // → StorageCompactSubTag (8-byte key; storage has no top tier) + TreePath compactPath = new(Keccak.Compute("trie_compact"), 10); // → StorageCompactSubTag (8-byte key) + TreePath fallbackPath = new(Keccak.Compute("trie_fb"), 20); // → StorageFallbackSubTag (33-byte key) + UInt256 slotIndex = 7; + + SnapshotContent c0 = new(); + c0.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + c0.Storages[(TestItem.AddressA, slotIndex)] = new SlotValue(new byte[] { 0x42 }); + c0.StorageNodes[(addrHash256, shortPath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + c0.StorageNodes[(addrHash256, compactPath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x81]); + c0.StorageNodes[(addrHash256, fallbackPath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x82]); + + // Different address in the second source so AddressA has matchCount==1 (single + // matching source) while still having ≥ 2 sources to compact. 
+ SnapshotContent c1 = new(); + c1.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(200).TestObject; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("s1")); + StateId s2 = new(2, Keccak.Compute("s2")); + tier.ConvertToPersistedBase(new Snapshot(s0, s1, c0, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + tier.ConvertToPersistedBase(new Snapshot(s1, s2, c1, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + + compactor.DoCompactSnapshot(s2); + + Assert.That(repo.TryLeasePersistedState(s2, SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True); + using (compacted) + { + BloomFilter bloom = compacted!.Bloom; + Assert.That(bloom.Count, Is.GreaterThan(0), + "Compacted snapshot must have a real bloom — the merge populates it from both sources"); + ValueHash256 addrHash = ValueKeccak.Compute(TestItem.AddressA.Bytes); + ulong addrKey = PersistedSnapshotBloomBuilder.AddressKey(TestItem.AddressA); + + Assert.Multiple(() => + { + Assert.That(bloom.MightContain(addrKey), Is.True, "Address key"); + Assert.That(bloom.MightContain(PersistedSnapshotBloomBuilder.SlotKey(addrKey, slotIndex)), Is.True, "Slot key"); + Assert.That(bloom.MightContain(PersistedSnapshotBloomBuilder.StorageNodeKey(in addrHash, in shortPath)), Is.True, + "Storage-trie short (compact) — fails when sibling TrySeek bound isn't reset between sub-tag seeks"); + Assert.That(bloom.MightContain(PersistedSnapshotBloomBuilder.StorageNodeKey(in addrHash, in compactPath)), Is.True, + "Storage-trie compact"); + Assert.That(bloom.MightContain(PersistedSnapshotBloomBuilder.StorageNodeKey(in addrHash, in fallbackPath)), Is.True, + "Storage-trie fallback"); + }); + } + } + + /// + /// Regression for the 4 KiB page-alignment pad applied by the BTree builder + /// (BlockBuilder.Add → TryAlign) when an about-to-straddle entry is pushed + /// onto a fresh page. 
The leading pad bytes must be inert so the outer leaf's + /// ValueStart = MetadataStart − ValueLength derivation lands inside the value and + /// decoding succeeds. Drives many distinct single-source addresses (matchCount==1) through + /// compaction with non-trivial inner tables (slots + a storage-trie node each) so positions + /// sweep across multiple page boundaries — at least some entries trigger the pad code path, + /// and all must round-trip read intact post-compaction. + /// + [TestCase(40)] + [TestCase(120)] + public void Compact_SingleSourceAddress_PageAlignPaddingPreservesValues(int accountCount) + { + using FlatTestContainer tier = new( + arenaFileSizeBytes: 256 * 1024, + blobFileSizeBytes: 4 * 1024 * 1024, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 2 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + // Source 0: accountCount addresses with varying slot counts so inner-table + // sizes span ~tens to ~hundreds of bytes — repeated fast-path writes + // sweep across 4 KiB page boundaries in the destination arena. + SnapshotContent c0 = new(); + for (int i = 0; i < accountCount; i++) + { + Address addr = TestItem.Addresses[i]; + c0.Accounts[addr] = Build.An.Account.WithBalance((UInt256)(i + 1)).TestObject; + int slots = 1 + (i % 7); + for (int s = 0; s < slots; s++) + c0.Storages[(addr, (UInt256)(s + 1))] = new SlotValue(new byte[] { (byte)((i * 13 + s) & 0xFF) }); + c0.StorageNodes[(Keccak.Compute(addr.Bytes), new TreePath(Keccak.Compute($"p{i}"), 4))] + = new TrieNode(NodeType.Leaf, [0xC1, (byte)(i & 0xFF)]); + } + + // Source 1: a single unrelated address so matchCount == 1 for every + // address in source 0 (drives them all through the fast path). 
+ SnapshotContent c1 = new(); + c1.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(999).TestObject; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("p1")); + StateId s2 = new(2, Keccak.Compute("p2")); + tier.ConvertToPersistedBase(new Snapshot(s0, s1, c0, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + tier.ConvertToPersistedBase(new Snapshot(s1, s2, c1, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + + compactor.DoCompactSnapshot(s2); + + Assert.That(repo.TryLeasePersistedState(s2, SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True); + using (compacted) + { + Assert.Multiple(() => + { + for (int i = 0; i < accountCount; i++) + { + Address addr = TestItem.Addresses[i]; + Assert.That(compacted!.TryGetAccount(addr, out Account? a), Is.True, + $"Account {i} must survive fast-path compaction"); + Assert.That(a!.Balance, Is.EqualTo((UInt256)(i + 1)), + $"Account {i} balance mismatch — pad bytes leaked into the value range"); + + int slots = 1 + (i % 7); + for (int s = 0; s < slots; s++) + { + SlotValue slot = default; + Assert.That(compacted.TryGetSlot(addr, (UInt256)(s + 1), ref slot), Is.True, + $"Slot {s + 1} for account {i} must survive fast-path compaction"); + SlotValue expected = new(new byte[] { (byte)((i * 13 + s) & 0xFF) }); + Assert.That(slot.AsReadOnlySpan.ToArray(), + Is.EqualTo(expected.AsReadOnlySpan.ToArray()), + $"Slot value mismatch for account {i} slot {s + 1}"); + } + } + }); + } + } + + /// + /// Metadata invariants for the blob-arena layout: base snapshots carry no + /// noderefs flag and a single ref_ids entry (their own blob arena id); + /// the compacted snapshot carries the noderefs flag and a ref_ids set + /// equal to the union of source base-snapshot blob arena ids. 
+ /// + [Test] + public void CompactedSnapshot_Metadata_NodeRefsFlagAndRefIdsUnion() + { + using FlatTestContainer tier = new( + arenaFileSizeBytes: 64 * 1024, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 4 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId prev = new(0, Keccak.EmptyTreeHash); + StateId[] states = new StateId[9]; + states[0] = prev; + HashSet baseRefIds = []; + for (int i = 1; i <= 8; i++) + { + states[i] = new StateId(i, Keccak.Compute($"{i}")); + SnapshotContent c = new(); + c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 100)).TestObject; + c.StateNodes[new TreePath(Keccak.Compute($"path{i}"), 4)] = new TrieNode(NodeType.Leaf, [(byte)(0xC1), (byte)i]); + tier.ConvertToPersistedBase(new Snapshot(prev, states[i], c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + prev = states[i]; + } + + for (int i = 1; i <= 8; i++) + { + Assert.That(repo.TryLeasePersistedState(states[i], SnapshotTier.PersistedBase, out PersistedSnapshot? baseSnap), Is.True); + using (baseSnap) + { + using WholeReadSession session = baseSnap!.BeginWholeReadSession(); + WholeReadSessionReader reader = session.CreateReader(); + ushort[]? ids = TestFixtureHelpers.ReadRefIdsFromMetadata(in reader); + Assert.That(ids, Is.Not.Null.And.Length.EqualTo(1), + $"Base snapshot {i} must carry exactly one blob-arena ref_id"); + baseRefIds.Add(ids![0]); + } + } + + compactor.DoCompactSnapshot(states[8]); + + Assert.That(repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True); + using (compacted) + { + using WholeReadSession session = compacted!.BeginWholeReadSession(); + WholeReadSessionReader reader = session.CreateReader(); + ushort[]? 
mergedIds = TestFixtureHelpers.ReadRefIdsFromMetadata(in reader); + Assert.That(mergedIds, Is.Not.Null); + Assert.That(new HashSet(mergedIds!), Is.EquivalentTo(baseRefIds), + "Compacted ref_ids must equal the union of source base blob-arena ids"); + } + } + + private static IEnumerable MergeValidationTestCases() + { + // Basic: two snapshots with overlapping accounts — newer balance wins. + { + SnapshotContent c0 = new(); + c0.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + SnapshotContent c1 = new(); + c1.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(200).TestObject; + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryGetAccount(TestItem.AddressA, out Account? a), Is.True); + Assert.That(a!.Balance, Is.EqualTo((UInt256)200)); + })) + .SetName("Merge_AccountOverride"); + } + + // Regression: advance-corrupts-minKey bug in NWayPackedArrayMerge (StateTopNodes). + // snapshot[0] has paths {A, B}, snapshot[1] has only {B} with different RLP. + { + TreePath pathA = new(Hash256.Zero, 4); + TreePath pathB = new(new Hash256("0x1000000000000000000000000000000000000000000000000000000000000000"), 4); + SnapshotContent c0 = new(); + c0.StateNodes[pathA] = new TrieNode(NodeType.Leaf, [0xC0]); + c0.StateNodes[pathB] = new TrieNode(NodeType.Leaf, [0xC0]); + SnapshotContent c1 = new(); + c1.StateNodes[pathB] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryLoadStateNodeRlp(pathA, out byte[]? rlpA), Is.True); + Assert.That(rlpA, Is.EqualTo(new byte[] { 0xC0 }), "State node only in older source must survive"); + Assert.That(s.TryLoadStateNodeRlp(pathB, out byte[]? 
rlpB), Is.True); + Assert.That(rlpB, Is.EqualTo(new byte[] { 0xC1, 0x80 }), "Overlapping state node — newer RLP must win"); + })) + .SetName("Merge_AdvanceOrder_StateTopNodes"); + } + + // Regression: same bug in NWayInnerMerge (StorageNodes inner merge). + { + Hash256 storageAddr = Keccak.Compute("storageAddr"); + TreePath pathA = new(Hash256.Zero, 8); + TreePath pathB = new(new Hash256("0x1000000000000000000000000000000000000000000000000000000000000000"), 8); + SnapshotContent c0 = new(); + c0.StorageNodes[(storageAddr, pathA)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + c0.StorageNodes[(storageAddr, pathB)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + SnapshotContent c1 = new(); + c1.StorageNodes[(storageAddr, pathB)] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x81]); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryLoadStorageNodeRlp(storageAddr.ValueHash256, pathA, out byte[]? rlpA), Is.True); + Assert.That(rlpA, Is.EqualTo(new byte[] { 0xC1, 0x80 }), "Storage node only in older source must survive"); + Assert.That(s.TryLoadStorageNodeRlp(storageAddr.ValueHash256, pathB, out byte[]? rlpB), Is.True); + Assert.That(rlpB, Is.EqualTo(new byte[] { 0xC2, 0x80, 0x81 }), "Overlapping storage node — newer RLP must win"); + })) + .SetName("Merge_AdvanceOrder_StorageNodes"); + } + + // Single-source per-sub-tag merge: the same addressHash is present in both sources + // (matchCount==2 for the storage-trie column). The Fallback (33-byte key) sub-tag and a + // c0-only node in the Compact (8-byte key) sub-tag are present only in the older source, + // while another Compact node overlaps both. This drives MergeStorageSubTag with active==1 + // for Fallback and active==2 for Compact (with both a unique and an overlapping node in the + // compact width). Storage has no top tier — a length-4 path lands in the compact sub-tag. 
+ { + Hash256 addrHash = Keccak.Compute(TestItem.AddressA.Bytes); + TreePath shortPath = new(Keccak.Compute("trie_top"), 4); // StorageCompactSubTag (8-byte key; c0-only) + TreePath compactPath = new(Keccak.Compute("trie_compact"), 10); // StorageCompactSubTag (8-byte key; overlaps) + TreePath fallbackPath = new(Keccak.Compute("trie_fb"), 20); // StorageFallbackSubTag (33-byte key) + SnapshotContent c0 = new(); + c0.StorageNodes[(addrHash, shortPath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + c0.StorageNodes[(addrHash, compactPath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x81]); + c0.StorageNodes[(addrHash, fallbackPath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x82]); + SnapshotContent c1 = new(); + c1.StorageNodes[(addrHash, compactPath)] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x81]); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryLoadStorageNodeRlp(addrHash.ValueHash256, shortPath, out byte[]? shortRlp), Is.True); + Assert.That(shortRlp, Is.EqualTo(new byte[] { 0xC1, 0x80 }), "c0-only compact node (shortPath) must survive"); + Assert.That(s.TryLoadStorageNodeRlp(addrHash.ValueHash256, compactPath, out byte[]? compactRlp), Is.True); + Assert.That(compactRlp, Is.EqualTo(new byte[] { 0xC2, 0x80, 0x81 }), "Compact sub-tag (active==2) — newer wins"); + Assert.That(s.TryLoadStorageNodeRlp(addrHash.ValueHash256, fallbackPath, out byte[]? fallbackRlp), Is.True); + Assert.That(fallbackRlp, Is.EqualTo(new byte[] { 0xC1, 0x82 }), "Fallback sub-tag (active==1) must survive"); + })) + .SetName("Merge_SingleSourceSubTag_CompactAndFallback"); + } + + // Mixed: all data types across two snapshots. 
+ { + Hash256 storageAddr = Keccak.Compute("storageAddr"); + TreePath statePath = new(Keccak.Compute("statePath"), 4); + TreePath storagePath = new(Hash256.Zero, 4); + SnapshotContent c0 = new(); + c0.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + c0.Storages[(TestItem.AddressA, 1)] = new SlotValue(new byte[] { 0x42 }); + c0.SelfDestructedStorageAddresses[TestItem.AddressB] = true; + c0.StateNodes[statePath] = new TrieNode(NodeType.Leaf, [0xC0, 0x80]); + c0.StorageNodes[(storageAddr, storagePath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + SnapshotContent c1 = new(); + c1.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance((UInt256)200).TestObject; + c1.Storages[(TestItem.AddressA, 2)] = new SlotValue(new byte[] { 0x99 }); + c1.StateNodes[statePath] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + c1.StorageNodes[(storageAddr, storagePath)] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x81]); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryGetAccount(TestItem.AddressA, out Account? 
a), Is.True); + Assert.That(a!.Balance, Is.EqualTo((UInt256)200), "Account override"); + + SlotValue slot1 = default; + Assert.That(s.TryGetSlot(TestItem.AddressA, 1, ref slot1), Is.True, "Older-only slot must survive (no self-destruct on A)"); + Assert.That(slot1.AsReadOnlySpan.ToArray(), Is.EqualTo(new SlotValue(new byte[] { 0x42 }).AsReadOnlySpan.ToArray())); + + SlotValue slot2 = default; + Assert.That(s.TryGetSlot(TestItem.AddressA, 2, ref slot2), Is.True); + Assert.That(slot2.AsReadOnlySpan.ToArray(), Is.EqualTo(new SlotValue(new byte[] { 0x99 }).AsReadOnlySpan.ToArray())); + + Assert.That(s.TryGetSelfDestructFlag(TestItem.AddressB), Is.Not.Null, + "Self-destruct flag for B (set in c0) must be present after compaction"); + Assert.That(s.TryGetAccount(TestItem.AddressB, out _), Is.False, + "self-destruct-only address reports no account change"); + + Assert.That(s.TryLoadStateNodeRlp(statePath, out byte[]? stateRlp), Is.True); + Assert.That(stateRlp, Is.EqualTo(new byte[] { 0xC1, 0x80 }), "State node — newer wins"); + + Assert.That(s.TryLoadStorageNodeRlp(storageAddr.ValueHash256, storagePath, out byte[]? storageRlp), Is.True); + Assert.That(storageRlp, Is.EqualTo(new byte[] { 0xC2, 0x80, 0x81 }), "Storage node — newer wins"); + })) + .SetName("Merge_MixedDataTypes"); + } + + // Cross-source per-address merge: an account-only entry in the older source and a + // self-destruct-only (account-Absent) entry in the newer source must merge to the older + // account paired with the newer self-destruct — exercising the newest-non-Absent account rule. + { + SnapshotContent c0 = new(); + c0.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(500).WithNonce(3).TestObject; + SnapshotContent c1 = new(); + c1.SelfDestructedStorageAddresses[TestItem.AddressA] = true; // "new"; no account change in c1 + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryGetAccount(TestItem.AddressA, out Account? 
a), Is.True, + "older account survives the newer self-destruct-only entry"); + Assert.That(a!.Balance, Is.EqualTo((UInt256)500)); + Assert.That(a.Nonce, Is.EqualTo((UInt256)3)); + Assert.That(s.TryGetSelfDestructFlag(TestItem.AddressA), Is.True, "newer self-destruct (new) wins the flag"); + })) + .SetName("Merge_AccountOnly_Then_SelfDestructOnly"); + } + + // Overlapping state node (newer wins) + non-overlapping accounts (both preserved). + { + TreePath path = new(Keccak.Compute("path"), 4); + SnapshotContent c0 = new(); + c0.StateNodes[path] = new TrieNode(NodeType.Leaf, [0xC0]); + c0.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + SnapshotContent c1 = new(); + c1.StateNodes[path] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + c1.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(200).TestObject; + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryLoadStateNodeRlp(path, out byte[]? rlp), Is.True); + Assert.That(rlp, Is.EqualTo(new byte[] { 0xC1, 0x80 }), "Newer state-node RLP wins"); + Assert.That(s.TryGetAccount(TestItem.AddressA, out Account? a), Is.True); + Assert.That(a!.Balance, Is.EqualTo((UInt256)100)); + Assert.That(s.TryGetAccount(TestItem.AddressB, out Account? b), Is.True); + Assert.That(b!.Balance, Is.EqualTo((UInt256)200)); + })) + .SetName("Merge_NewerOverridesOlder"); + } + + // Two distinct state node paths, both survive merge. + { + TreePath p1 = new(Keccak.Compute("path1"), 4); + TreePath p2 = new(Keccak.Compute("path2"), 4); + SnapshotContent c0 = new(); + c0.StateNodes[p1] = new TrieNode(NodeType.Leaf, [0xC0]); + SnapshotContent c1 = new(); + c1.StateNodes[p2] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryLoadStateNodeRlp(p1, out byte[]? r1), Is.True); + Assert.That(r1, Is.EqualTo(new byte[] { 0xC0 })); + Assert.That(s.TryLoadStateNodeRlp(p2, out byte[]? 
r2), Is.True); + Assert.That(r2, Is.EqualTo(new byte[] { 0xC1, 0x80 })); + })) + .SetName("Merge_PreservesNonOverlapping"); + } + + // Older slot cleared by self-destruct, newer slot + flag preserved. + { + SnapshotContent c0 = new(); + c0.Storages[(TestItem.AddressA, 1)] = new SlotValue(new byte[] { 0x42 }); + SnapshotContent c1 = new(); + c1.SelfDestructedStorageAddresses[TestItem.AddressA] = false; + c1.Storages[(TestItem.AddressA, 2)] = new SlotValue(new byte[] { 0x99 }); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + SlotValue slot1 = default; + Assert.That(s.TryGetSlot(TestItem.AddressA, 1, ref slot1), Is.False, "Older slot must be cleared by newer destruct"); + SlotValue slot2 = default; + Assert.That(s.TryGetSlot(TestItem.AddressA, 2, ref slot2), Is.True); + Assert.That(slot2.AsReadOnlySpan.ToArray(), Is.EqualTo(new SlotValue(new byte[] { 0x99 }).AsReadOnlySpan.ToArray())); + Assert.That(s.TryGetSelfDestructFlag(TestItem.AddressA), Is.False, "Destruct flag must be present and value must be `false` (destructed)"); + })) + .SetName("Merge_SelfDestruct_ClearsOlderStorage"); + } + + // Barrier isolation: a self-destruct truncates only its own address's older slots; a sibling + // address with no self-destruct keeps its slots. Slots live in their own column now, so this + // exercises the merge's cross-address self-destruct-barrier walk. 
+ { + SnapshotContent c0 = new(); + c0.Storages[(TestItem.AddressA, 1)] = new SlotValue(new byte[] { 0x11 }); + c0.Storages[(TestItem.AddressB, 1)] = new SlotValue(new byte[] { 0x22 }); + SnapshotContent c1 = new(); + c1.SelfDestructedStorageAddresses[TestItem.AddressA] = false; + c1.Storages[(TestItem.AddressA, 2)] = new SlotValue(new byte[] { 0x33 }); + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + SlotValue a1 = default, a2 = default, b1 = default; + Assert.That(s.TryGetSlot(TestItem.AddressA, 1, ref a1), Is.False, "A's older slot truncated by A's destruct"); + Assert.That(s.TryGetSlot(TestItem.AddressA, 2, ref a2), Is.True, "A's post-destruct slot survives"); + Assert.That(a2.AsReadOnlySpan.ToArray(), Is.EqualTo(new SlotValue(new byte[] { 0x33 }).AsReadOnlySpan.ToArray())); + Assert.That(s.TryGetSlot(TestItem.AddressB, 1, ref b1), Is.True, "B (no destruct) keeps its slot"); + Assert.That(b1.AsReadOnlySpan.ToArray(), Is.EqualTo(new SlotValue(new byte[] { 0x22 }).AsReadOnlySpan.ToArray())); + })) + .SetName("Merge_SelfDestruct_BarrierIsolation_AcrossAddresses"); + } + + // Newer true flag doesn't overwrite older false (destructed) — TryAdd semantics. + { + SnapshotContent c0 = new(); + c0.SelfDestructedStorageAddresses[TestItem.AddressA] = false; + SnapshotContent c1 = new(); + c1.SelfDestructedStorageAddresses[TestItem.AddressA] = true; + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryGetSelfDestructFlag(TestItem.AddressA), Is.False, + "Older `false` (destructed) flag must win over newer `true` (new-account) flag"); + })) + .SetName("Merge_SelfDestruct_TryAddSemantics"); + } + + // Storage trie nodes survive self-destruct (only storage *slot* data is cleared). 
+ { + Hash256 addrHash = Keccak.Compute(TestItem.AddressA.Bytes); + TreePath storagePath = new(Keccak.Compute("storage_path"), 4); + SnapshotContent c0 = new(); + c0.StorageNodes[(addrHash, storagePath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + SnapshotContent c1 = new(); + c1.SelfDestructedStorageAddresses[TestItem.AddressA] = false; + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryLoadStorageNodeRlp(addrHash.ValueHash256, storagePath, out byte[]? rlp), Is.True, + "Storage trie node must survive self-destruct of the account"); + Assert.That(rlp, Is.EqualTo(new byte[] { 0xC1, 0x80 })); + })) + .SetName("Merge_SelfDestruct_StorageNodesKept"); + } + + // Single-source, no-slot verbatim fast path: A (account-only EOA) and C (account + + // self-destruct flag) appear in only one source and carry no slots, so each is + // byte-copied verbatim through the outer builder; B keeps the second source non-empty. + { + SnapshotContent c0 = new(); + c0.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject; + c0.Accounts[TestItem.AddressC] = Build.An.Account.WithBalance(300).TestObject; + c0.SelfDestructedStorageAddresses[TestItem.AddressC] = false; + SnapshotContent c1 = new(); + c1.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(200).TestObject; + yield return new TestCaseData( + (object)new[] { c0, c1 }, + (Action)(s => + { + Assert.That(s.TryGetAccount(TestItem.AddressA, out Account? a), Is.True); + Assert.That(a!.Balance, Is.EqualTo((UInt256)100), "Account-only EOA copied verbatim"); + SlotValue slotA = default; + Assert.That(s.TryGetSlot(TestItem.AddressA, 1, ref slotA), Is.False, "EOA has no slots"); + + Assert.That(s.TryGetAccount(TestItem.AddressC, out Account? 
c), Is.True); + Assert.That(c!.Balance, Is.EqualTo((UInt256)300), "Account survives verbatim copy"); + Assert.That(s.TryGetSelfDestructFlag(TestItem.AddressC), Is.False, + "Self-destruct flag survives verbatim copy alongside the account sub-tag"); + + Assert.That(s.TryGetAccount(TestItem.AddressB, out Account? b), Is.True); + Assert.That(b!.Balance, Is.EqualTo((UInt256)200)); + })) + .SetName("Merge_SingleSource_NoSlot_Verbatim"); + } + } + + [TestCaseSource(nameof(MergeValidationTestCases))] + public void MergeSnapshots_ValidatesCorrectly(SnapshotContent[] contents, Action assertCompacted) + { + // maxCompactSize == 2 — only a size-2 compaction is attempted, so + // exactly two consecutive base snapshots are merged into one compacted snapshot. + using FlatTestContainer tier = new( + arenaFileSizeBytes: 64 * 1024, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 2 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId[] states = new StateId[contents.Length + 1]; + states[0] = new StateId(0, Keccak.EmptyTreeHash); + for (int i = 0; i < contents.Length; i++) + { + states[i + 1] = new StateId(i + 1, Keccak.Compute($"{i + 1}")); + tier.ConvertToPersistedBase( + new Snapshot(states[i], states[i + 1], contents[i], _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + } + + compactor.DoCompactSnapshot(states[contents.Length]); + + Assert.That(repo.TryLeasePersistedState(states[contents.Length], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True, + "Expected a compacted snapshot to exist after DoCompactSnapshot"); + using (compacted) + { + assertCompacted(compacted!); + } + } + + // Config: compactSize=1 (PersistenceManager boundary), maxCompactSize=8. + // blockNumber=8 → 8 & -8 = 8, so the compaction window is [0, 8]. 
+    //
+    // presentBlocks: which block-slots are populated (snapshot From=states[b-1], To=states[b]).
+    // The window need not be fully populated — whatever contiguous chain of ≥2 snapshots
+    // assembles back from block 8 is compacted into a single snapshot.
+    // expectCompacted=false means no compaction expected.
+    private static IEnumerable PartialWindowCompactionCases()
+    {
+        // Full 8-block range present: compacts the whole window. Linked s0→s8.
+        yield return new TestCaseData(new[] { 1, 2, 3, 4, 5, 6, 7, 8 }, true, 0L, 8L)
+            .SetName("PartialWindow_FullRange_Compacts0To8");
+
+        // Blocks 3–8 present: the chain reaches back to s2, a non-power-of-2 boundary.
+        // The old power-of-2 step-down would have compacted only [4,8]; now the whole
+        // assembled chain [2,8] is compacted instead.
+        yield return new TestCaseData(new[] { 3, 4, 5, 6, 7, 8 }, true, 2L, 8L)
+            .SetName("PartialWindow_NonPowerOfTwoStart_Compacts2To8");
+
+        // Only blocks 5–8 present: chain reaches back to s4. Compacts [4,8].
+        yield return new TestCaseData(new[] { 5, 6, 7, 8 }, true, 4L, 8L)
+            .SetName("PartialWindow_Half_Compacts4To8");
+
+        // Only blocks 7–8 present: chain reaches back to s6. Compacts [6,8].
+        yield return new TestCaseData(new[] { 7, 8 }, true, 6L, 8L)
+            .SetName("PartialWindow_Quarter_Compacts6To8");
+
+        // Only 1 block present: no pair available, no compaction.
+        yield return new TestCaseData(new[] { 8 }, false, 0L, 0L)
+            .SetName("PartialWindow_NoRange_NoCompact");
+    }
+
+    // Persists a base snapshot for every block listed in presentBlocks, compacts at block 8 and
+    // checks whether a large-compacted snapshot ending at block 8 exists and which block range
+    // it covers. expectedFromBlock / expectedToBlock are only checked when expectCompacted is true.
+    [TestCaseSource(nameof(PartialWindowCompactionCases))]
+    public void DoCompactSnapshot_CompactsPartialWindow(
+        int[] presentBlocks, bool expectCompacted, long expectedFromBlock, long expectedToBlock)
+    {
+        // CompactSize=1 makes every block a boundary; block 8 → window [0, 8].
+        using FlatTestContainer tier = new(
+            arenaFileSizeBytes: 64 * 1024,
+            configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 8 }, 0)));
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        StateId[] states = new StateId[9];
+        states[0] = new StateId(0, Keccak.EmptyTreeHash);
+        for (int i = 1; i <= 8; i++)
+            states[i] = new StateId(i, Keccak.Compute($"{i}"));
+
+        foreach (int block in presentBlocks)
+        {
+            SnapshotContent content = new();
+            content.Accounts[TestItem.Addresses[block - 1]] = Build.An.Account.WithBalance((ulong)block * 100).TestObject;
+            tier.ConvertToPersistedBase(new Snapshot(states[block - 1], states[block], content, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+        }
+
+        compactor.DoCompactSnapshot(states[8]);
+
+        // Lease once and release through `using`, so the snapshot is disposed even when an
+        // assertion below throws, and also when a lease unexpectedly succeeds in the
+        // "no compaction" case. `using` skips Dispose when the resource is null.
+        bool leased = repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted);
+        using (compacted)
+        {
+            Assert.That(leased, Is.EqualTo(expectCompacted),
+                expectCompacted ? "Expected a compacted snapshot" : "Expected no compacted snapshot");
+
+            if (expectCompacted)
+            {
+                Assert.That(compacted!.From.BlockNumber, Is.EqualTo(expectedFromBlock));
+                Assert.That(compacted.To.BlockNumber, Is.EqualTo(expectedToBlock));
+            }
+        }
+    }
+
+    // A [0,8] large-compacted (To=8) survives until persistence passes block 8, so its From=0 sits
+    // below any persistence point in (0, 8]. The widest-skip-first assemble walk would follow that
+    // edge and drag block 16's compaction down to From=0. Clamping the window to the persistence
+    // point makes the walk reject the below-P edge and assemble from P upward via the bases instead.
+    // Each case is (persistedBlock, expectedFromBlock) for DoCompactSnapshot_ClampsWindowToPersistencePoint.
+    private static IEnumerable ClampToPersistenceCases()
+    {
+        // P at genesis: no clamp, the walk follows the [0,8] large-compacted skip-pointer to From=0.
+        yield return new TestCaseData(0L, 0L).SetName("ClampToPersistence_GenesisP_NoClamp_From0");
+        // P inside the [0,8] span: the below-P edge is skipped, the walk wins at From=P via the bases.
+        yield return new TestCaseData(4L, 4L).SetName("ClampToPersistence_PInsideSpan_ClampsFrom4");
+        // P at the [0,8] To boundary: still clamped, never reaching the From=0 edge.
+        yield return new TestCaseData(8L, 8L).SetName("ClampToPersistence_PAtBoundary_ClampsFrom8");
+    }
+
+    // Seeds a [0,8] large-compacted snapshot (compacted with persistedBlockNumber: 0), adds bases
+    // for blocks 9..16, then compacts block 16 with persistedBlockNumber = persistedBlock and
+    // checks that the resulting large-compacted snapshot ends at 16 and starts at expectedFromBlock.
+    [TestCaseSource(nameof(ClampToPersistenceCases))]
+    public void DoCompactSnapshot_ClampsWindowToPersistencePoint(long persistedBlock, long expectedFromBlock)
+    {
+        // CompactSize=1 makes every block a boundary; MaxCompactSize=16 so block 16's window is [0, 16].
+        using FlatTestContainer tier = new(
+            arenaFileSizeBytes: 256 * 1024,
+            blobFileSizeBytes: 4 * 1024 * 1024,
+            configure: b => b.AddSingleton(
+                ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 16 }, 0)));
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        // states[i] is the state id at block i; states[0] is the empty-tree genesis state.
+        StateId[] states = new StateId[17];
+        states[0] = new StateId(0, Keccak.EmptyTreeHash);
+        for (int i = 1; i <= 16; i++)
+            states[i] = new StateId(i, Keccak.Compute($"{i}"));
+
+        // Build base snapshots [0..8], then the [0,8] large-compacted skip-pointer.
+        for (int i = 1; i <= 8; i++)
+            BuildBase(tier, states, i);
+        compactor.DoCompactSnapshot(states[8], persistedBlockNumber: 0);
+        Assert.That(repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? seed), Is.True,
+            "precondition: the [0,8] large-compacted skip-pointer must exist");
+        // The lease was only needed to prove the seed exists; release it straight away.
+        seed!.Dispose();
+
+        // Build base snapshots [9..16] so narrower edges exist above the persistence point.
+        for (int i = 9; i <= 16; i++)
+            BuildBase(tier, states, i);
+
+        // Compact block 16's [0,16] window, clamped to the persistence point.
+        compactor.DoCompactSnapshot(states[16], persistedBlockNumber: persistedBlock);
+
+        Assert.That(repo.TryLeasePersistedState(states[16], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True,
+            "Expected a large-compacted snapshot at block 16");
+        using (compacted)
+        {
+            Assert.That(compacted!.To.BlockNumber, Is.EqualTo(16));
+            Assert.That(compacted.From.BlockNumber, Is.EqualTo(expectedFromBlock),
+                persistedBlock == 0
+                    ? "Unclamped: the walk follows the [0,8] large-compacted edge down to From=0"
+                    : "Clamped: the below-P [0,8] edge is rejected and the walk wins at From=P");
+        }
+    }
+
+    // Persists one base snapshot spanning states[block - 1] → states[block]. Its only content is a
+    // single account (TestItem.Addresses[block - 1]) with balance block * 100. The value returned
+    // by ConvertToPersistedBase is disposed immediately.
+    private void BuildBase(FlatTestContainer tier, StateId[] states, int block)
+    {
+        SnapshotContent content = new();
+        content.Accounts[TestItem.Addresses[block - 1]] = Build.An.Account.WithBalance((ulong)block * 100).TestObject;
+        tier.ConvertToPersistedBase(new Snapshot(states[block - 1], states[block], content, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+    }
+
+    ///
+    /// After compaction, <c>TryLoadStateNodeRlp</c> / <c>TryLoadStorageNodeRlp</c>
+    /// must dereference the merged
+    /// snapshot's per-key NodeRefs through the union of referenced blob arenas
+    /// and yield the newest-writer RLP for overlapping paths, the only-writer RLP for
+    /// non-overlapping paths.
+    ///
+    [Test]
+    public void CompactedSnapshot_TrieNodeResolution_NewerOverridesOlder()
+    {
+        using FlatTestContainer tier = new(
+            arenaFileSizeBytes: 64 * 1024,
+            configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 4 }, 0)));
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        // Paths: one state path and one storage path written by both block 1 and block 8,
+        // plus one state path written only by block 1 and one written only by block 8.
+        TreePath sharedStatePath = new(Keccak.Compute("shared_state"), 4);
+        TreePath onlyOldStatePath = new(Keccak.Compute("only_old_state"), 4);
+        TreePath onlyNewStatePath = new(Keccak.Compute("only_new_state"), 4);
+        Hash256 storageTrieAddr = Keccak.Compute("storage_trie_addr");
+        TreePath sharedStoragePath = new(Keccak.Compute("shared_storage"), 6);
+
+        // Old and new payloads differ in both content and length so a stale read is detectable.
+        byte[] oldStateRlp = [0xC1, 0x80];
+        byte[] newStateRlp = [0xC2, 0x81, 0x42];
+        byte[] onlyOldRlp = [0xC1, 0x33];
+        byte[] onlyNewRlp = [0xC1, 0x55];
+        byte[] oldStorageRlp = [0xC1, 0x80];
+        byte[] newStorageRlp = [0xC2, 0x82, 0x99];
+
+        // Persist eight consecutive base snapshots: block 1 (oldest) and block 8 (newest) write
+        // trie nodes, blocks 2..7 write only a single account each.
+        StateId prev = new(0, Keccak.EmptyTreeHash);
+        for (int i = 1; i <= 8; i++)
+        {
+            StateId next = new(i, Keccak.Compute($"{i}"));
+            SnapshotContent c = new();
+            if (i == 1)
+            {
+                c.StateNodes[sharedStatePath] = new TrieNode(NodeType.Leaf, oldStateRlp);
+                c.StateNodes[onlyOldStatePath] = new TrieNode(NodeType.Leaf, onlyOldRlp);
+                c.StorageNodes[(storageTrieAddr, sharedStoragePath)] = new TrieNode(NodeType.Leaf, oldStorageRlp);
+            }
+            else if (i == 8)
+            {
+                c.StateNodes[sharedStatePath] = new TrieNode(NodeType.Leaf, newStateRlp);
+                c.StateNodes[onlyNewStatePath] = new TrieNode(NodeType.Leaf, onlyNewRlp);
+                c.StorageNodes[(storageTrieAddr, sharedStoragePath)] = new TrieNode(NodeType.Leaf, newStorageRlp);
+            }
+            else
+            {
+                c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 10)).TestObject;
+            }
+            tier.ConvertToPersistedBase(new Snapshot(prev, next, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+            prev = next;
+        }
+
+        // After the loop `prev` is the block-8 state id, i.e. the tip to compact at.
+        compactor.DoCompactSnapshot(prev);
+
+        Assert.That(repo.TryLeasePersistedState(prev, SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True);
+        using (compacted)
+        {
+            Assert.That(compacted!.TryLoadStateNodeRlp(sharedStatePath, out byte[]? sharedResult), Is.True);
+            Assert.That(sharedResult, Is.EqualTo(newStateRlp),
+                "Overlapping state-node path must resolve to newest writer's RLP");
+
+            Assert.That(compacted.TryLoadStateNodeRlp(onlyOldStatePath, out byte[]? oldOnly), Is.True);
+            Assert.That(oldOnly, Is.EqualTo(onlyOldRlp),
+                "State node only in the oldest source must survive the merge with its original RLP");
+
+            Assert.That(compacted.TryLoadStateNodeRlp(onlyNewStatePath, out byte[]? newOnly), Is.True);
+            Assert.That(newOnly, Is.EqualTo(onlyNewRlp),
+                "State node only in the newest source must survive the merge with its original RLP");
+
+            Assert.That(compacted.TryLoadStorageNodeRlp(storageTrieAddr.ValueHash256, sharedStoragePath, out byte[]? storageResult), Is.True);
+            Assert.That(storageResult, Is.EqualTo(newStorageRlp),
+                "Overlapping storage-node path must resolve to newest writer's RLP");
+        }
+    }
+
+    ///
+    /// Regression for the builder no-storage fast path in
+    /// PersistedSnapshotBuilder.WritePerAddressColumn: when an address has no
+    /// slots and no storage-trie nodes the per-address inner table is staged into a
+    /// pooled buffer so its length is known up-front, and the outer leaf entry applies
+    /// 4 KiB page-alignment padding. Drives many EOAs so writer positions sweep across
+    /// page boundaries; every address must round-trip read intact and every self-destruct
+    /// flag must survive the staging path. A mix of plain EOAs, EOA-with-SD and a few
+    /// contracts (which take the streaming path) confirms both branches coexist.
+    ///
+    [TestCase(40)]
+    [TestCase(120)]
+    public void WritePerAddressColumn_NoStorageFastPath_RoundTripsEoaSnapshot(int accountCount)
+    {
+        using FlatTestContainer tier = new(arenaFileSizeBytes: 256 * 1024, blobFileSizeBytes: 4 * 1024 * 1024);
+        SnapshotRepository repo = tier.Repository;
+
+        // Every 7th address gets storage (so the streaming path also fires) and the
+        // routing decision flips per-address; every 5th address gets a self-destruct
+        // flag (so the SD sub-tag is exercised on the staged DenseByteIndex).
+        SnapshotContent c = new();
+        for (int i = 0; i < accountCount; i++)
+        {
+            Address addr = TestItem.Addresses[i];
+            // Balance i + 1 is unique per address, so a misrouted read shows up as a mismatch.
+            c.Accounts[addr] = Build.An.Account.WithBalance((UInt256)(i + 1)).TestObject;
+            // The flag value alternates: true when i is a multiple of 10, false for the other multiples of 5.
+            if (i % 5 == 0)
+                c.SelfDestructedStorageAddresses[addr] = (i % 10 == 0);
+            if (i % 7 == 0)
+                c.Storages[(addr, 1)] = new SlotValue(new byte[] { (byte)(i & 0xFF) });
+        }
+
+        // Persist everything as a single base snapshot s0 → s1; no compaction is involved here.
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s1 = new(1, Keccak.Compute("p1"));
+        tier.ConvertToPersistedBase(new Snapshot(s0, s1, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+
+        Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? built), Is.True);
+        using (built)
+        {
+            // Assert.Multiple collects failures for all addresses rather than stopping at the first.
+            Assert.Multiple(() =>
+            {
+                for (int i = 0; i < accountCount; i++)
+                {
+                    Address addr = TestItem.Addresses[i];
+                    Assert.That(built!.TryGetAccount(addr, out Account? a), Is.True,
+                        $"Account {i} ({(i % 7 == 0 ? "with-storage" : "no-storage")}) must survive WritePerAddressColumn");
+                    Assert.That(a!.Balance, Is.EqualTo((UInt256)(i + 1)),
+                        $"Account {i} balance mismatch — pad bytes leaked into the value range");
+                    if (i % 5 == 0)
+                    {
+                        Assert.That(built.TryGetSelfDestructFlag(addr), Is.EqualTo((bool?)(i % 10 == 0)),
+                            $"Self-destruct flag for account {i} must survive the staged DenseByteIndex path");
+                    }
+                    if (i % 7 == 0)
+                    {
+                        SlotValue slot = default;
+                        Assert.That(built.TryGetSlot(addr, 1, ref slot), Is.True,
+                            $"Slot for storage-bearing account {i} must come back from the streaming path");
+                        SlotValue expected = new(new byte[] { (byte)(i & 0xFF) });
+                        Assert.That(slot.AsReadOnlySpan.ToArray(), Is.EqualTo(expected.AsReadOnlySpan.ToArray()));
+                    }
+                }
+            });
+        }
+    }
+
+    /// <summary>
+    /// Regression for the merger no-storage fast path in
+    /// <c>PersistedSnapshotMerger.NWayMergePerAddressColumn</c>: two snapshots covering
+    /// the SAME set of EOAs collide on every address (matchCount > 1) without any
+    /// source contributing slots or storage-trie nodes, so the staged-and-padded helper
+    /// runs for every cursor address. Newest-wins on Account / first-non-empty on Address
+    /// preimage / TryAdd on SD must all hold after the staged DenseByteIndex round-trips.
+    /// </summary>
+    [TestCase(40)]
+    [TestCase(120)]
+    public void Compact_MultiSourceMerge_NoStorageFastPath_RoundTrips(int accountCount)
+    {
+        using FlatTestContainer tier = new(
+            arenaFileSizeBytes: 256 * 1024,
+            blobFileSizeBytes: 4 * 1024 * 1024,
+            configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 2 }, 0)));
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        // Both sources touch every address with a different balance — collision on
+        // every cursor address forces matchCount==2, and the absence of slots /
+        // storage-trie nodes in either source flips the no-storage routing on.
+        SnapshotContent c0 = new();
+        SnapshotContent c1 = new();
+        for (int i = 0; i < accountCount; i++)
+        {
+            Address addr = TestItem.Addresses[i];
+            c0.Accounts[addr] = Build.An.Account.WithBalance((UInt256)(i + 1)).TestObject;
+            c1.Accounts[addr] = Build.An.Account.WithBalance((UInt256)((i + 1) * 1000)).TestObject;
+            // Every 5th address: set the destruct flag only in c0 (older). TryAdd
+            // semantics must preserve it through the merge with c1 (which doesn't set
+            // it), and the staged DenseByteIndex must emit it as sub-tag 0x03.
+            if (i % 5 == 0)
+                c0.SelfDestructedStorageAddresses[addr] = false;
+        }
+
+        // c0 is persisted as s0 → s1 (older) and c1 as s1 → s2 (newer), then the pair is compacted at s2.
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s1 = new(1, Keccak.Compute("p1"));
+        StateId s2 = new(2, Keccak.Compute("p2"));
+        tier.ConvertToPersistedBase(new Snapshot(s0, s1, c0, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+        tier.ConvertToPersistedBase(new Snapshot(s1, s2, c1, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+
+        compactor.DoCompactSnapshot(s2);
+
+        Assert.That(repo.TryLeasePersistedState(s2, SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? compacted), Is.True);
+        using (compacted)
+        {
+            // Assert.Multiple collects failures for all addresses rather than stopping at the first.
+            Assert.Multiple(() =>
+            {
+                for (int i = 0; i < accountCount; i++)
+                {
+                    Address addr = TestItem.Addresses[i];
+                    Assert.That(compacted!.TryGetAccount(addr, out Account? a), Is.True,
+                        $"Account {i} must survive the staged multi-source merge");
+                    Assert.That(a!.Balance, Is.EqualTo((UInt256)((i + 1) * 1000)),
+                        $"Account {i}: newest balance (c1) must win — pad bytes must not leak into the value range");
+                    if (i % 5 == 0)
+                    {
+                        Assert.That(compacted.TryGetSelfDestructFlag(addr), Is.False,
+                            $"Self-destruct flag for account {i} must survive the staged DenseByteIndex merge");
+                    }
+                }
+            });
+        }
+    }
+
+    ///
+    /// Regression for the offset-vs-block-number mismatch in
+    /// DoCompactSnapshot's startingBlockNumber. 
The alignment value comes
+    /// from the offset-shifted schedule but the start-of-window was computed in raw
+    /// block-number space — the previous
+    /// startingBlockNumber = ((blockNumber - 1) / alignment) * alignment formula
+    /// only matched the trigger's actual window when offset == 0. With a non-zero
+    /// offset it produced a span of (blockNumber mod alignment) instead of
+    /// alignment.
+    ///
+    /// Test geometry: offset=3, CompactSize=64, maxCompactSize=32. At block 45,
+    /// (45 + 3) & -(45 + 3) = 48 & -48 = 16, so alignment=16 fires.
+    /// Window must be (29, 45] (span 16), not the buggy (32, 45] (span 13).
+    ///
+    [Test]
+    public void DoCompactSnapshot_WithNonZeroScheduleOffset_StartingBlockSpansFullAlignment()
+    {
+        // The final argument (3) is the schedule offset this regression depends on.
+        using FlatTestContainer tier = new(
+            arenaFileSizeBytes: 256 * 1024,
+            blobFileSizeBytes: 4 * 1024 * 1024,
+            configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 64, PersistedSnapshotMaxCompactSize = 32 }, 3)));
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        // 45 base snapshots, blocks 1..45. No intermediate compactions so
+        // AssemblePersistedSnapshotsForCompaction sees only bases.
+        StateId prev = new(0, Keccak.EmptyTreeHash);
+        StateId tip = prev;
+        for (int i = 1; i <= 45; i++)
+        {
+            StateId next = new(i, Keccak.Compute($"s{i}"));
+            SnapshotContent c = new();
+            c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance((UInt256)i).TestObject;
+            tier.ConvertToPersistedBase(new Snapshot(prev, next, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+            prev = next;
+            if (i == 45) tip = next;
+        }
+
+        // At block 45 with offset=3, alignment=16. Window must be (29, 45].
+        compactor.DoCompactSnapshot(tip);
+
+        Assert.That(repo.TryLeasePersistedState(tip, SnapshotTier.PersistedSmallCompacted, out PersistedSnapshot? compacted), Is.True);
+        try
+        {
+            Assert.That(compacted!.From.BlockNumber, Is.EqualTo(29),
+                "startingBlockNumber must be (blockNumber - alignment) — the left edge of the window the offset-shifted alignment trigger selects");
+            Assert.That(compacted.To.BlockNumber, Is.EqualTo(45));
+            Assert.That(compacted.To.BlockNumber - compacted.From.BlockNumber, Is.EqualTo(16),
+                "compacted span must equal alignment, not (blockNumber mod alignment)");
+        }
+        finally { compacted!.Dispose(); }
+    }
+
+    // Creates a test container with 256 KiB arena files, 4 MiB blob files and a zero-offset
+    // schedule built from a FlatDbConfig that sets only CompactSize.
+    private static FlatTestContainer NewTier(int compactSize) => new(
+        arenaFileSizeBytes: 256 * 1024,
+        blobFileSizeBytes: 4 * 1024 * 1024,
+        configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = compactSize }, 0)));
+
+    // DoCompactSnapshot on an empty repository must leave the persisted-snapshot count at zero.
+    [Test]
+    public void DoCompactSnapshot_NoOp_WhenWindowSizeOneOrTooFewSnapshots()
+    {
+        using FlatTestContainer tier = NewTier(compactSize: 4);
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        // Block 1: natural window size is 1 → nothing to merge.
+        compactor.DoCompactSnapshot(new StateId(1, Keccak.Compute("b1")));
+        // Block 4: window size 4, but the empty repo has < 2 snapshots.
+        compactor.DoCompactSnapshot(new StateId(4, Keccak.Compute("b4")));
+
+        Assert.That(tier.Repository.PersistedSnapshotCount, Is.EqualTo(0), "no compaction should have run");
+    }
+
+    // DoCompactCompactSized on an empty repository must leave the persisted-snapshot count at zero,
+    // both off a boundary (block 3) and on one (block 4).
+    [Test]
+    public void DoCompactCompactSized_NoOp_WhenNotBoundaryOrTooFewSnapshots()
+    {
+        using FlatTestContainer tier = NewTier(compactSize: 4);
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        compactor.DoCompactCompactSized(new StateId(3, Keccak.Compute("b3"))); // not a boundary
+        compactor.DoCompactCompactSized(new StateId(4, Keccak.Compute("b4"))); // boundary, but empty repo
+
+        Assert.That(tier.Repository.PersistedSnapshotCount, Is.EqualTo(0), "no CompactSized snapshot should have been produced");
+    }
+
+    // With CompactSize=4 and four base snapshots (blocks 1..4), DoCompactCompactSized at block 4
+    // must yield a PersistedCompactSized snapshot spanning 0 → 4 that contains every block's account.
+    [Test]
+    public void DoCompactCompactSized_AtBoundary_ProducesCompactSizedSnapshot()
+    {
+        using FlatTestContainer tier = NewTier(compactSize: 4);
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        // One base snapshot per block, each adding a distinct account (Addresses[i - 1]).
+        StateId prev = new(0, Keccak.EmptyTreeHash);
+        StateId tip = prev;
+        for (int i = 1; i <= 4; i++)
+        {
+            tip = new(i, Keccak.Compute($"p{i}"));
+            SnapshotContent c = new();
+            c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 10)).TestObject;
+            tier.ConvertToPersistedBase(new Snapshot(prev, tip, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+            prev = tip;
+        }
+
+        compactor.DoCompactCompactSized(tip);
+
+        Assert.That(repo.TryLeasePersistedState(tip, SnapshotTier.PersistedCompactSized, out PersistedSnapshot? compactSized), Is.True);
+        try
+        {
+            Assert.That(compactSized!.From.BlockNumber, Is.EqualTo(0));
+            Assert.That(compactSized.To.BlockNumber, Is.EqualTo(4));
+            for (int i = 1; i <= 4; i++)
+                Assert.That(compactSized.TryGetAccount(TestItem.Addresses[i - 1], out _), Is.True, $"account from block {i} missing");
+        }
+        finally { compactSized!.Dispose(); }
+    }
+
+    // Both source snapshots carry only state trie nodes (no accounts, slots or self-destruct
+    // flags). The compaction at block 2 must still produce a small-compacted snapshot from
+    // which the block-2 state node can be read back.
+    [Test]
+    public void DoCompactSnapshot_AtBoundary_NoAddressColumn_WarmsGracefully()
+    {
+        using FlatTestContainer tier = NewTier(compactSize: 2);
+        SnapshotRepository repo = tier.Repository;
+        PersistedSnapshotCompactor compactor = tier.Compactor;
+
+        StateId prev = new(0, Keccak.EmptyTreeHash);
+        StateId tip = prev;
+        for (int i = 1; i <= 2; i++)
+        {
+            tip = new(i, Keccak.Compute($"sn{i}"));
+            SnapshotContent c = new();
+            TreePath path = new(Keccak.Compute($"node{i}"), 4);
+            c.StateNodes[path] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, (byte)i]);
+            tier.ConvertToPersistedBase(new Snapshot(prev, tip, c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose();
+            prev = tip;
+        }
+
+        compactor.DoCompactSnapshot(tip); // block 2 is a CompactSize=2 boundary → WarmAddressColumnIndex path
+
+        Assert.That(repo.TryLeasePersistedState(tip, SnapshotTier.PersistedSmallCompacted, out PersistedSnapshot? compacted), Is.True);
+        try
+        {
+            Assert.That(compacted!.To.BlockNumber, Is.EqualTo(2));
+            // Same key derivation as the loop used for i == 2.
+            TreePath probe = new(Keccak.Compute("node2"), 4);
+            Assert.That(compacted.TryLoadStateNodeRlp(probe, out _), Is.True, "state node must survive the no-address-column compaction");
+        }
+        finally { compacted!.Dispose(); }
+    }
+
+    ///
+    /// A sub-CompactSize intermediate merge lands in the <see cref="SnapshotTier.PersistedSmallCompacted"/>
+    /// tier; a >CompactSize large-boundary merge lands in <see cref="SnapshotTier.PersistedLargeCompacted"/>.
+    /// Each tier resolves only from its own bucket — a lease for the other tier at the same To misses.
+ /// + [Test] + public void DoCompactSnapshot_SplitsCompactedAndLargeCompactedByWindowWidth() + { + // CompactSize=4: block 2's window (0,2] spans 2 (< 4) → compacted; block 8's window (0,8] spans 8 (> 4) → large. + using FlatTestContainer tier = NewTier(compactSize: 4); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId prev = new(0, Keccak.EmptyTreeHash); + StateId[] states = new StateId[9]; + states[0] = prev; + for (int i = 1; i <= 8; i++) + { + states[i] = new StateId(i, Keccak.Compute($"s{i}")); + SnapshotContent c = new(); + c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 100)).TestObject; + tier.ConvertToPersistedBase(new Snapshot(prev, states[i], c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + prev = states[i]; + } + + compactor.DoCompactSnapshot(states[2]); // sub-CompactSize intermediate + compactor.DoCompactSnapshot(states[8]); // >CompactSize large-boundary merge + + Assert.Multiple(() => + { + Assert.That(repo.TryLeasePersistedState(states[2], SnapshotTier.PersistedSmallCompacted, out PersistedSnapshot? compacted), Is.True, + "sub-CompactSize window must be a PersistedSmallCompacted snapshot"); + using (compacted) Assert.That(compacted!.To.BlockNumber, Is.EqualTo(2)); + + Assert.That(repo.TryLeasePersistedState(states[2], SnapshotTier.PersistedLargeCompacted, out _), Is.False, + "PersistedSmallCompacted must not resolve from the large-compacted bucket"); + + Assert.That(repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? 
large), Is.True, + ">CompactSize window must be a PersistedLargeCompacted snapshot"); + using (large) Assert.That(large!.To.BlockNumber, Is.EqualTo(8)); + + Assert.That(repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedSmallCompacted, out _), Is.False, + "PersistedLargeCompacted must not resolve from the compacted bucket"); + }); + } + + /// + /// A demoted sub-CompactSize intermediate that no wider compaction has covered keeps its real, + /// populated merged bloom — Demote only advises its pages cold. Regression for reverting the + /// AlwaysTrue-sentinel-on-demote behaviour. + /// + [Test] + public void Demote_KeepsIntermediateRealBloom() + { + // CompactSize=4: block 2's window (0,2] spans 2 (< 4) → demoted intermediate. No large boundary + // is compacted, so nothing shares over it. + using FlatTestContainer tier = NewTier(compactSize: 4); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId prev = new(0, Keccak.EmptyTreeHash); + StateId[] states = new StateId[3]; + states[0] = prev; + for (int i = 1; i <= 2; i++) + { + states[i] = new StateId(i, Keccak.Compute($"s{i}")); + SnapshotContent c = new(); + c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 100)).TestObject; + tier.ConvertToPersistedBase(new Snapshot(prev, states[i], c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + prev = states[i]; + } + + compactor.DoCompactSnapshot(states[2]); // sub-CompactSize intermediate → demoted, keeps its real bloom + + Assert.That(repo.TryLeasePersistedState(states[2], SnapshotTier.PersistedSmallCompacted, out PersistedSnapshot? intermediate), Is.True); + using (intermediate) + { + Assert.Multiple(() => + { + // A real merge over the window's two accounts carries keys (Count > 0), unlike the + // Count==0 AlwaysTrue sentinel the reverted demote path installed. 
+ Assert.That(intermediate!.Bloom.Count, Is.GreaterThan(0), "demoted intermediate must keep its real bloom"); + Assert.That(intermediate.TryGetAccount(TestItem.Addresses[0], out Account? a1), Is.True); + Assert.That(a1!.Balance, Is.EqualTo((UInt256)100)); + Assert.That(intermediate.TryGetAccount(TestItem.Addresses[1], out Account? a2), Is.True); + Assert.That(a2!.Balance, Is.EqualTo((UInt256)200)); + }); + } + } + + /// + /// A >CompactSize large-boundary merge adopts its own (superset) bloom across every persisted + /// snapshot fully contained in its (from, to] window — base, sub-CompactSize intermediate + /// and CompactSized alike. Each contained snapshot ends up reference-equal to the big merge's bloom (so + /// its own bloom is freed) and still reads back correctly. Regression for bloom sharing. + /// + [Test] + public void LargeBoundary_SharesBloomAcrossContainedSnapshots() + { + // CompactSize=4: block 8's window (0,8] spans 8 (> 4) → large boundary → shares its bloom. + using FlatTestContainer tier = NewTier(compactSize: 4); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId prev = new(0, Keccak.EmptyTreeHash); + StateId[] states = new StateId[9]; + states[0] = prev; + for (int i = 1; i <= 8; i++) + { + states[i] = new StateId(i, Keccak.Compute($"s{i}")); + SnapshotContent c = new(); + c.Accounts[TestItem.Addresses[i - 1]] = Build.An.Account.WithBalance((UInt256)(i * 100)).TestObject; + tier.ConvertToPersistedBase(new Snapshot(prev, states[i], c, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + prev = states[i]; + } + + compactor.DoCompactSnapshot(states[2]); // sub-CompactSize intermediate (small compacted) + compactor.DoCompactCompactSized(states[4]); // CompactSize boundary → CompactSized + compactor.DoCompactSnapshot(states[8]); // large boundary → shares its bloom across (0,8] + + Assert.That(repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedLargeCompacted, out 
PersistedSnapshot? big), Is.True); + using (big) + { + BloomFilter shared = big!.Bloom; + Assert.That(shared.Count, Is.GreaterThan(0), "the large merge keeps a real, populated bloom"); + + // The sub-CompactSize intermediate and the CompactSized both adopt the shared bloom. + AssertShares(repo, states[2], SnapshotTier.PersistedSmallCompacted, shared); + AssertShares(repo, states[4], SnapshotTier.PersistedCompactSized, shared); + + // Every contained base snapshot adopts the shared bloom and still resolves its account. + for (int i = 1; i <= 8; i++) + { + Assert.That(repo.TryLeasePersistedState(states[i], SnapshotTier.PersistedBase, out PersistedSnapshot? baseSnap), Is.True); + using (baseSnap) + { + Assert.That(ReferenceEquals(baseSnap!.Bloom, shared), Is.True, $"base {i} should share the big merge's bloom"); + Assert.That(baseSnap.TryGetAccount(TestItem.Addresses[i - 1], out Account? a), Is.True, $"account from block {i} must still resolve"); + Assert.That(a!.Balance, Is.EqualTo((UInt256)(i * 100))); + } + } + } + + static void AssertShares(SnapshotRepository repo, StateId at, SnapshotTier tier, BloomFilter shared) + { + Assert.That(repo.TryLeasePersistedState(at, tier, out PersistedSnapshot? s), Is.True, $"{tier} at {at.BlockNumber} must exist"); + using (s) + Assert.That(ReferenceEquals(s!.Bloom, shared), Is.True, $"{tier} at {at.BlockNumber} should share the big merge's bloom"); + } + } + + /// + /// A snapshot extending below the big merge's from (its keys are not a subset of the merge's + /// window) must NOT adopt the merge's bloom — sharing it would yield false negatives. Builds a [0,8] + /// large skip-pointer, then a [4,16] big merge clamped to persistence block 4, and asserts the [0,8] + /// snapshot keeps its own bloom. + /// + [Test] + public void LargeBoundary_DoesNotShareBloomIntoSnapshotExtendingBelowFrom() + { + // CompactSize=1 makes every block a boundary; MaxCompactSize=16 so block 16's window is [0, 16]. 
+ using FlatTestContainer tier = new( + arenaFileSizeBytes: 256 * 1024, + blobFileSizeBytes: 4 * 1024 * 1024, + configure: b => b.AddSingleton( + ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 16 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId[] states = new StateId[17]; + states[0] = new StateId(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= 16; i++) + states[i] = new StateId(i, Keccak.Compute($"{i}")); + + // Build base [0..8], then the [0,8] large-compacted skip-pointer. + for (int i = 1; i <= 8; i++) + BuildBase(tier, states, i); + compactor.DoCompactSnapshot(states[8], persistedBlockNumber: 0); + + // Build base [9..16], then the [0,16] window clamped to persistence point 4 → big merge is [4,16]. + for (int i = 9; i <= 16; i++) + BuildBase(tier, states, i); + compactor.DoCompactSnapshot(states[16], persistedBlockNumber: 4); + + Assert.That(repo.TryLeasePersistedState(states[16], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? big), Is.True); + using (big) + { + Assert.That(big!.From.BlockNumber, Is.EqualTo(4), "precondition: the big merge is clamped to From=4"); + Assert.That(repo.TryLeasePersistedState(states[8], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? below), Is.True); + using (below) + Assert.That(ReferenceEquals(below!.Bloom, big.Bloom), Is.False, + "a [0,8] snapshot extending below from=4 must keep its own bloom"); + } + } + + /// + /// Sizing the merged bloom must count a filter shared by several sources only once. A large + /// compaction adopts its (superset) bloom across the snapshots it contains, so a later compaction + /// can assemble several sources that all point at that one filter — each reporting its whole-window + /// key count. Summing per source inflates bloomCapacity (and thus the merged filter) by the + /// number of sharers. 
Builds a [0,8] large skip-pointer that shares its bloom across bases [1,8], + /// then a [4,16] merge clamped to persistence 4 assembling bases [5,16] — bases [5,8] share the + /// [0,8] bloom — and asserts the merged filter's capacity equals the deduplicated source-bloom sum, + /// not the inflated per-source sum. + /// + [Test] + public void LargeBoundary_MergedBloomCapacity_DeduplicatesSharedSourceBloom() + { + // CompactSize=1 makes every block a boundary; MaxCompactSize=16 so block 16's window is [0, 16]. + using FlatTestContainer tier = new( + arenaFileSizeBytes: 256 * 1024, + blobFileSizeBytes: 4 * 1024 * 1024, + configure: b => b.AddSingleton( + ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 1, PersistedSnapshotMaxCompactSize = 16 }, 0))); + SnapshotRepository repo = tier.Repository; + PersistedSnapshotCompactor compactor = tier.Compactor; + + StateId[] states = new StateId[17]; + states[0] = new StateId(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= 16; i++) + states[i] = new StateId(i, Keccak.Compute($"{i}")); + + // Build base [0..8], then the [0,8] large-compacted skip-pointer — it shares its bloom over [1,8]. + for (int i = 1; i <= 8; i++) + BuildBase(tier, states, i); + compactor.DoCompactSnapshot(states[8], persistedBlockNumber: 0); + + // Build base [9..16]; the [0,16] window clamps to persistence 4, so the merge spans [4,16] and + // assembles bases [5,16] — bases [5,8] still carry the shared [0,8] bloom. + for (int i = 9; i <= 16; i++) + BuildBase(tier, states, i); + + // Capture the source blooms the merge will see, BEFORE it runs and replaces them with its own + // shared bloom. dedupedSum counts each distinct filter once (the [0,8] bloom across bases [5,8]); + // naiveSum is the buggy per-source sum that double-counts it. + long dedupedSum = 0, naiveSum = 0; + HashSet distinct = []; + for (int i = 5; i <= 16; i++) + { + Assert.That(repo.TryLeasePersistedState(states[i], SnapshotTier.PersistedBase, out PersistedSnapshot? 
src), Is.True); + using (src) + { + naiveSum += src!.Bloom.Count; + if (distinct.Add(src.Bloom)) dedupedSum += src.Bloom.Count; + } + } + Assert.That(distinct.Count, Is.LessThan(12), "precondition: bases [5,8] must share one bloom, so fewer than 12 distinct filters"); + Assert.That(dedupedSum, Is.LessThan(naiveSum), "precondition: the shared bloom is double-counted by a naive per-source sum"); + + compactor.DoCompactSnapshot(states[16], persistedBlockNumber: 4); + + Assert.That(repo.TryLeasePersistedState(states[16], SnapshotTier.PersistedLargeCompacted, out PersistedSnapshot? big), Is.True); + using (big) + { + Assert.That(big!.From.BlockNumber, Is.EqualTo(4), "precondition: the merge is clamped to From=4"); + Assert.That(big.Bloom.Capacity, Is.EqualTo(dedupedSum), + "merged bloom capacity must count the shared source bloom once, not once per sharer"); + } + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotPerAddressTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotPerAddressTests.cs new file mode 100644 index 000000000000..976487d9aafe --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotPerAddressTests.cs @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Core.Extensions; +using Nethermind.Core.Test.Builders; +using Nethermind.Serialization.Rlp; +using Nethermind.State.Flat.PersistedSnapshots; +using NUnit.Framework; +using AccountState = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotPerAddress.AccountState; +using SelfDestructState = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotPerAddress.SelfDestructState; + +namespace Nethermind.State.Flat.Test; + +// The internal nested enums can't appear in public test-method signatures (CS0051), so the +// parameterized cases run via private helpers driven from a couple of public 
[Test] methods. +[TestFixture] +public class PersistedSnapshotPerAddressTests +{ + private static readonly SelfDestructState[] AllSelfDestructStates = + [SelfDestructState.None, SelfDestructState.Destructed, SelfDestructState.New]; + + // Every (account state × self-destruct state) the codec must round-trip. Absent+None is not + // emitted by the builder (storage-only addresses write nothing) but the codec stays agnostic. + [Test] + public void Encode_then_decode_round_trips_all_states() + { + Account present = Build.An.Account.WithBalance(12345).WithNonce(7).TestObject; + Account presentWithCodeAndStorage = Build.An.Account.WithBalance(0).WithNonce(0) + .WithCode([0x60, 0x00]).WithStorageRoot(Keccak.Compute("storage")).TestObject; + + Assert.Multiple(() => + { + foreach (SelfDestructState sd in AllSelfDestructStates) + { + AssertRoundTrip(AccountState.Present, present, sd); + AssertRoundTrip(AccountState.Deleted, null, sd); + AssertRoundTrip(AccountState.Absent, null, sd); + } + AssertRoundTrip(AccountState.Present, presentWithCodeAndStorage, SelfDestructState.New); + }); + } + + // Item 0 discriminates Deleted (single byte 0x00) from Absent (empty string 0x80) positionally; + // item 1 is the self-destruct int (None=0 → 0x80, Destructed=1 → 0x01, New=2 → 0x02). 0xc2 is the + // outer two-byte-content list header. + [Test] + public void NonPresent_account_item_encodes_expected_bytes() => Assert.Multiple(() => + { + Assert.That(Encode(AccountState.Deleted, null, SelfDestructState.None), Is.EqualTo(Bytes.FromHexString("c20080"))); + Assert.That(Encode(AccountState.Absent, null, SelfDestructState.None), Is.EqualTo(Bytes.FromHexString("c28080"))); + Assert.That(Encode(AccountState.Deleted, null, SelfDestructState.Destructed), Is.EqualTo(Bytes.FromHexString("c20001"))); + Assert.That(Encode(AccountState.Absent, null, SelfDestructState.New), Is.EqualTo(Bytes.FromHexString("c28002"))); + }); + + private static void AssertRoundTrip(AccountState state, Account? 
account, SelfDestructState sd) + { + byte[] value = Encode(state, account, sd); + string label = $"{state}+{sd}"; + + PersistedSnapshotPerAddress.Decode(value, out AccountState decodedState, out Account? decodedAccount, out SelfDestructState decodedSd); + Assert.That(decodedState, Is.EqualTo(state), label); + Assert.That(decodedSd, Is.EqualTo(sd), label); + if (state == AccountState.Present) + { + Assert.That(decodedAccount, Is.Not.Null, label); + Assert.That(decodedAccount!.Balance, Is.EqualTo(account!.Balance), label); + Assert.That(decodedAccount.Nonce, Is.EqualTo(account.Nonce), label); + Assert.That(decodedAccount.StorageRoot, Is.EqualTo(account.StorageRoot), label); + Assert.That(decodedAccount.CodeHash, Is.EqualTo(account.CodeHash), label); + } + else + { + Assert.That(decodedAccount, Is.Null, label); + } + + // The split read helpers must agree with the combined decode. + Assert.That(PersistedSnapshotPerAddress.TryDecodeAccount(value, out Account? viaTry), Is.EqualTo(state != AccountState.Absent), label); + Assert.That(viaTry?.Balance, Is.EqualTo(decodedAccount?.Balance), label); + Assert.That(PersistedSnapshotPerAddress.DecodeSelfDestructState(value), Is.EqualTo(sd), label); + Assert.That(PersistedSnapshotPerAddress.DecodeSelfDestruct(value), Is.EqualTo(PersistedSnapshotPerAddress.ToFlag(sd)), label); + } + + private static byte[] Encode(AccountState state, Account? 
account, SelfDestructState sd) + { + byte[] buf = new byte[256]; + RlpStream stream = new(buf); + int len = PersistedSnapshotPerAddress.Encode(stream, state, account, sd); + return buf[..len]; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotRepositoryTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotRepositoryTests.cs new file mode 100644 index 000000000000..0b124e36fb43 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotRepositoryTests.cs @@ -0,0 +1,613 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Core.Test.Builders; +using Nethermind.Db; +using Nethermind.Int256; +using Nethermind.State.Flat.Persistence.BloomFilter; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.Trie; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class PersistedSnapshotRepositoryTests +{ + private string _testDir = null!; + private ResourcePool _pool = null!; + + [SetUp] + public void SetUp() + { + _testDir = Path.Combine(Path.GetTempPath(), $"nethermind_test_{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + _pool = new ResourcePool(new FlatDbConfig()); + } + + [TearDown] + public void TearDown() + { + if (Directory.Exists(_testDir)) + Directory.Delete(_testDir, recursive: true); + } + + private Snapshot CreateTestSnapshot(StateId from, StateId to, Address? account = null, UInt256 balance = default) + { + SnapshotContent content = new(); + if (account is not null) + content.Accounts[account] = Build.An.Account.WithBalance(balance == 0 ? 
1000 : balance).TestObject; + return new Snapshot(from, to, content, _pool, ResourcePool.Usage.MainBlockProcessing); + } + + [Test] + public void PersistSnapshot_And_Query() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + Snapshot snap = CreateTestSnapshot(s0, s1, TestItem.AddressA); + + tier.ConvertToPersistedBase(snap).Dispose(); + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(1)); + + Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? persisted), Is.True); + Assert.That(persisted!.From, Is.EqualTo(s0)); + Assert.That(persisted.To, Is.EqualTo(s1)); + Assert.That(persisted.TryGetAccount(TestItem.AddressA, out Account? decoded), Is.True); + Assert.That(decoded!.Balance, Is.EqualTo((UInt256)1000)); + persisted.Dispose(); + } + + /// + /// Regression: an address with 256k sequential storage slots fills four fully-dense + /// 30-byte slot-prefix groups (65536 slots each). The builder writes the per-address + /// slot column through ArenaBufferWriter (see ), + /// and a full prefix group's inner sub-slot table exceeds that writer's 1 MiB buffer — so the + /// single BlockBuilder.Add for the oversized prefix-group value must still round-trip. + /// + [Test] + public void ConvertSnapshot_SequentialSlotsAcrossDensePrefixGroups_RoundTrips() + { + // 64 MiB shared arena: a 256k-slot snapshot (~10 MiB) stays below the 512 MiB + // dedicated-arena threshold, so it must fit within a single shared arena file. 
+ using FlatTestContainer tier = new(arenaFileSizeBytes: 64 * 1024 * 1024, blobFileSizeBytes: 4 * 1024 * 1024); + SnapshotRepository repo = tier.Repository; + + const int slotCount = 256 * 1024; + SnapshotContent content = new(); + TestFixtureHelpers.AddSequentialSlots(content, TestItem.AddressA, firstSlot: 1, count: slotCount); + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("seq-slots")); + using PersistedSnapshot persisted = tier.ConvertToPersistedBase( + new Snapshot(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing)); + + // Probe slots spanning multiple prefix groups (group boundaries fall on multiples of 65536). + foreach (int probe in new[] { 1, 65535, 65536, 131072, slotCount }) + { + SlotValue slot = default; + Assert.That(persisted.TryGetSlot(TestItem.AddressA, (UInt256)probe, ref slot), Is.True, $"slot {probe} missing"); + Assert.That(slot.AsReadOnlySpan.SequenceEqual(TestFixtureHelpers.SequentialSlotValue(probe)), Is.True, + $"slot {probe} value mismatch"); + } + } + + [Test] + public void NewerSnapshot_OverridesOlderValue() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + + TreePath path = new(Keccak.Compute("path"), 4); + byte[] rlp1 = [0xC0]; + byte[] rlp2 = [0xC1, 0x80]; + + SnapshotContent content1 = new(); + content1.StateNodes[path] = new TrieNode(NodeType.Leaf, rlp1); + Snapshot snap1 = new(s0, s1, content1, _pool, ResourcePool.Usage.MainBlockProcessing); + + SnapshotContent content2 = new(); + content2.StateNodes[path] = new TrieNode(NodeType.Leaf, rlp2); + Snapshot snap2 = new(s1, s2, content2, _pool, ResourcePool.Usage.MainBlockProcessing); + + tier.ConvertToPersistedBase(snap1).Dispose(); + tier.ConvertToPersistedBase(snap2).Dispose(); + + Assert.That(repo.TryLeasePersistedState(s2, 
SnapshotTier.PersistedBase, out PersistedSnapshot? newest), Is.True); + Assert.That(newest!.TryLoadStateNodeRlp(path, out byte[]? result), Is.True); + Assert.That(result, Is.EqualTo(rlp2)); + newest.Dispose(); + } + + [Test] + public void LoadFromCatalog_RestoresSnapshots() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + MemDb catalogDb = new(); + + using (FlatTestContainer tier1 = new(arenaFileSizeBytes: 4096, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier1.Repository; + Snapshot snap = CreateTestSnapshot(s0, s1, TestItem.AddressA); + tier1.ConvertToPersistedBase(snap).Dispose(); + } + + using (FlatTestContainer tier2 = new(arenaFileSizeBytes: 4096, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo = tier2.Repository; + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(1)); + Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? snapshot), Is.True); + snapshot!.Dispose(); + } + } + + [Test] + public void ConvertSnapshot_RoundTrip_AllDataCategories() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + Address acctAddr = TestItem.AddressA; + Address selfDestructAddr = TestItem.AddressB; + Address storageAddr = TestItem.AddressC; + UInt256 slotIndex = (UInt256)42; + byte[] slotBytes = new byte[32]; + slotBytes[31] = 0xAB; + slotBytes[30] = 0xCD; + SlotValue slotValue = new(slotBytes); + + TreePath statePath = new(Keccak.Compute("state_path"), 4); + byte[] stateRlp = [0xC2, 0x80, 0x80]; + Hash256 storageTrieAddr = Keccak.Compute("storage_trie_addr"); + TreePath storagePath = new(Keccak.Compute("storage_path"), 6); + byte[] storageRlp = [0xC1, 0x80]; + + SnapshotContent content = new(); + content.Accounts[acctAddr] = Build.An.Account.WithBalance(500).TestObject; + 
content.Storages[(storageAddr, slotIndex)] = slotValue; + content.SelfDestructedStorageAddresses[selfDestructAddr] = false; + content.StateNodes[statePath] = new TrieNode(NodeType.Leaf, stateRlp); + content.StorageNodes[(storageTrieAddr, storagePath)] = new TrieNode(NodeType.Branch, storageRlp); + Snapshot snap = new(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing); + + tier.ConvertToPersistedBase(snap).Dispose(); + + Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? persisted), Is.True); + using PersistedSnapshot _ = persisted!; + + Assert.That(persisted!.TryGetAccount(acctAddr, out Account? account), Is.True); + Assert.That(account, Is.Not.Null); + Assert.That(account!.Balance, Is.EqualTo((UInt256)500)); + + SlotValue readSlot = default; + Assert.That(persisted.TryGetSlot(storageAddr, slotIndex, ref readSlot), Is.True); + Assert.That(readSlot.AsReadOnlySpan.ToArray(), Is.EqualTo(slotBytes)); + + Assert.That(persisted.TryGetSelfDestructFlag(selfDestructAddr), Is.Not.Null); + + Assert.That(persisted.TryLoadStateNodeRlp(statePath, out byte[]? stateResult), Is.True); + Assert.That(stateResult, Is.EqualTo(stateRlp)); + + Assert.That(persisted.TryLoadStorageNodeRlp(storageTrieAddr.ValueHash256, storagePath, out byte[]? 
storageResult), Is.True); + Assert.That(storageResult, Is.EqualTo(storageRlp)); + } + + [Test] + public void RemoveStatesUntil_RemovesOldSnapshots() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + StateId s3 = new(3, Keccak.Compute("3")); + + Snapshot snap1 = CreateTestSnapshot(s0, s1, TestItem.AddressA); + Snapshot snap2 = CreateTestSnapshot(s1, s2, TestItem.AddressB); + Snapshot snap3 = CreateTestSnapshot(s2, s3, TestItem.AddressC); + + tier.ConvertToPersistedBase(snap1).Dispose(); + tier.ConvertToPersistedBase(snap2).Dispose(); + tier.ConvertToPersistedBase(snap3).Dispose(); + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(3)); + + // Remove states until block 2 (removes snap1 with To=1) + repo.RemovePersistedStatesUntil(2); + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(2)); + } + + [TestCase(100)] + [TestCase(1000)] + public void ManyBaseSnapshots_ShareUnderlyingFiles(int count) + { + // Regression for the old "Blob arena id space exhausted (65535 arenas per tier)" + // bug: ids were minted per base-conversion call, so 65k base + // snapshots used 65k blob arena ids. Per-file ids pack many writers into one file — + // file count stays bounded under steady state. + using FlatTestContainer tier = new(arenaFileSizeBytes: 64 * 1024); + SnapshotRepository repo = tier.Repository; + + StateId prev = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= count; i++) + { + StateId next = new(i, Keccak.Compute($"s{i}")); + Snapshot snap = CreateTestSnapshot(prev, next, TestItem.Addresses[i % TestItem.Addresses.Length]); + tier.ConvertToPersistedBase(snap).Dispose(); + prev = next; + } + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(count)); + // Files stay packed: bounded by max file size / typical write size, not by snapshot count. 
+ int blobFileCount = Directory.GetFiles(Path.Combine(tier.BaseDbPath, "persisted_snapshot", "blob"), "blob_*.bin").Length; + Assert.That(blobFileCount, Is.LessThan(count), + "expected many base snapshots to share blob arena files"); + } + + [TestCase(true, TestName = "ConvertSnapshot_RecordsBlobRange(with trie nodes)")] + [TestCase(false, TestName = "ConvertSnapshot_RecordsBlobRange(no trie nodes)")] + public void ConvertSnapshot_RecordsBlobRange(bool withTrieNode) + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 64 * 1024); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1000).TestObject; + if (withTrieNode) + content.StateNodes[new TreePath(Keccak.Compute("p"), 4)] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]); + + using PersistedSnapshot persisted = tier.ConvertToPersistedBase( + new Snapshot(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing)); + + if (withTrieNode) + { + Assert.That(persisted.BlobRange.IsEmpty, Is.False, "a base snapshot with trie nodes records a non-empty blob range"); + Assert.That(persisted.BlobRange.Length, Is.GreaterThan(0)); + } + else + { + Assert.That(persisted.BlobRange.IsEmpty, Is.True, "a base snapshot with no trie nodes has no blob region"); + } + } + + [TestCase(true, TestName = "BlobRange_SurvivesReloadViaMetadata(with trie nodes)")] + [TestCase(false, TestName = "BlobRange_SurvivesReloadViaMetadata(no trie nodes)")] + public void BlobRange_SurvivesReloadViaMetadata(bool withTrieNode) + { + // The blob range lives in the snapshot's own metadata table (blob_range key), not the + // catalog, so it must round-trip a restart: read back by the PersistedSnapshot ctor. 
+ MemDb catalogDb = new(); + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + using (FlatTestContainer tier1 = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb)) + { + SnapshotRepository repo1 = tier1.Repository; + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1000).TestObject; + if (withTrieNode) + content.StateNodes[new TreePath(Keccak.Compute("p"), 4)] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]); + tier1.ConvertToPersistedBase( + new Snapshot(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + } + + using FlatTestContainer tier2 = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb); + SnapshotRepository repo2 = tier2.Repository; + + Assert.That(repo2.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? reloaded), Is.True); + using (reloaded) + Assert.That(reloaded!.BlobRange.IsEmpty, Is.EqualTo(!withTrieNode), + "the base's blob range must round-trip a restart via its metadata table"); + } + + [Test] + public void LeaseBaseSnapshotsInRange_ReturnsBasesTilingWindow() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 64 * 1024); + SnapshotRepository repo = tier.Repository; + + StateId[] ids = new StateId[4]; + ids[0] = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i < 4; i++) + { + ids[i] = new(i, Keccak.Compute($"s{i}")); + tier.ConvertToPersistedBase( + CreateTestSnapshot(ids[i - 1], ids[i], TestItem.Addresses[i])).Dispose(); + } + + using PersistedSnapshotList bases = repo.LeaseBaseSnapshotsInRange(ids[0], ids[3]); + Assert.That(bases.Count, Is.EqualTo(3)); + // Walk-back order: newest first. 
+ Assert.That(bases[0].To, Is.EqualTo(ids[3])); + Assert.That(bases[^1].From, Is.EqualTo(ids[0])); + } + + /// + /// Regression for the ReconstructBloom pass inside LoadFromCatalog: after a restart, a bloom is + /// rebuilt only for the widest snapshot covering each range and shared across it. The CompactSized + /// covering (0, 4] holds every address written across the four bases, and each contained base adopts + /// that one wide bloom (the same instance) rather than the AlwaysTrue placeholder or its own. + /// + [Test] + public void LoadFromCatalog_ReconstructsBloom_SharedFromWidest() + { + StateId[] ids = new StateId[5]; + ids[0] = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= 4; i++) ids[i] = new(i, Keccak.Compute($"s{i}")); + + MemDb catalogDb = new(); + + // Session 1: 4 bases + a CompactSize=4 CompactSized covering all 4 of them. + using (FlatTestContainer tier1 = new( + arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 4 }, 0)))) + { + SnapshotRepository repo = tier1.Repository; + for (int i = 1; i <= 4; i++) + tier1.ConvertToPersistedBase( + CreateTestSnapshot(ids[i - 1], ids[i], TestItem.Addresses[i - 1])).Dispose(); + + tier1.Compactor.DoCompactCompactSized(ids[4]); // CompactSized at To=4 covering (0, 4] + } + + // Session 2: reload. LoadFromCatalog now auto-calls ReconstructBloom. + using FlatTestContainer tier2 = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb); + SnapshotRepository repo2 = tier2.Repository; + + // With the v7 (To, depth)-keyed catalog the base at ids[4] survives alongside the + // CompactSized at the same To — both buckets must lease independently. + Assert.That(repo2.TryLeasePersistedState(ids[4], SnapshotTier.PersistedCompactSized, out PersistedSnapshot? 
compactSizedAt4), Is.True); + using (compactSizedAt4) + { + // The widest snapshot covering (0, 4] — the chain's starting snapshot. Its bloom is rebuilt + // from its own merged table and holds every address written across the four bases. + BloomFilter shared = compactSizedAt4!.Bloom; + Assert.That(shared.Count, Is.GreaterThan(0), + "ReconstructBloom must have built a real bloom for the widest (starting) snapshot"); + Assert.That(compactSizedAt4.From.BlockNumber, Is.EqualTo(0)); + Assert.That(compactSizedAt4.To.BlockNumber, Is.EqualTo(4)); + for (int i = 1; i <= 4; i++) + { + ulong key = PersistedSnapshotBloomBuilder.AddressKey(TestItem.Addresses[i - 1]); + Assert.That(shared.MightContain(key), Is.True, + $"AddressKey for base {i} must be in the widest snapshot's merged bloom"); + } + + // Each contained base adopts the widest snapshot's bloom (the same instance), not its own. + for (int i = 1; i <= 4; i++) + { + Assert.That(repo2.TryLeasePersistedState(ids[i], SnapshotTier.PersistedBase, out PersistedSnapshot? baseAt), Is.True, + $"base at ids[{i}] must round-trip under v7"); + using (baseAt) + Assert.That(ReferenceEquals(baseAt!.Bloom, shared), Is.True, + $"base {i} must share the widest snapshot's bloom"); + } + } + } + + /// + /// Regression for the v7 (To, depth)-keyed catalog: before v7, a CompactSized at the + /// same To as a base overwrote the base's catalog entry, so a restart would lose the + /// base. With v7 both round-trip independently — SnapshotCount on reload equals the + /// number of Add calls in the prior session. 
+ /// + [Test] + public void LoadFromCatalog_RoundTripsBaseAndCompactSizedAtSameTo() + { + StateId[] ids = new StateId[5]; + ids[0] = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= 4; i++) ids[i] = new(i, Keccak.Compute($"s{i}")); + + MemDb catalogDb = new(); + + using (FlatTestContainer tier1 = new( + arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 4 }, 0)))) + { + SnapshotRepository repo = tier1.Repository; + for (int i = 1; i <= 4; i++) + tier1.ConvertToPersistedBase( + CreateTestSnapshot(ids[i - 1], ids[i], TestItem.Addresses[i - 1])).Dispose(); + + tier1.Compactor.DoCompactCompactSized(ids[4]); + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(5), "session 1 must hold 4 bases + 1 CompactSized"); + } + + using FlatTestContainer tier2 = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb); + SnapshotRepository repo2 = tier2.Repository; + + Assert.That(repo2.PersistedSnapshotCount, Is.EqualTo(5), + "all five snapshots (4 bases + 1 CompactSized at the last base's To) must round-trip under v7"); + for (int i = 1; i <= 4; i++) + { + Assert.That(repo2.TryLeasePersistedState(ids[i], SnapshotTier.PersistedBase, out PersistedSnapshot? b), Is.True, + $"base at ids[{i}] must survive reload"); + b!.Dispose(); + } + Assert.That(repo2.TryLeasePersistedState(ids[4], SnapshotTier.PersistedCompactSized, out PersistedSnapshot? 
compactSized), Is.True); + compactSized!.Dispose(); + } + + /// + /// Exercise the parallel-then-serial split in LoadFromCatalog: build enough + /// snapshots in session 1 to spread across multiple + /// partitions, reload in session 2, and verify the parallel construction + serial + /// sorted-set rebuild preserves: snapshot count, per-bucket leasability, ordered-id + /// invariants (the From/To chain reachable via LeaseBaseSnapshotsInRange), and the + /// ReconstructBloom end-state (snapshots in a compacted range share that range's bloom). + /// Stays below ParallelLoadThreshold so the progress logger is bypassed — + /// that codepath is a one-line gate we trust by inspection. + /// + [Test] + public void LoadFromCatalog_Parallel_PreservesOrderingAndDicts() + { + const int N = 32; + StateId[] ids = new StateId[N + 1]; + ids[0] = new(0, Keccak.EmptyTreeHash); + for (int i = 1; i <= N; i++) ids[i] = new(i, Keccak.Compute($"s{i}")); + + MemDb catalogDb = new(); + + using (FlatTestContainer tier1 = new( + arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb, + configure: b => b.AddSingleton(ScheduleHelper.CreateWithOffset(new FlatDbConfig { CompactSize = 8 }, 0)))) + { + SnapshotRepository repo = tier1.Repository; + for (int i = 1; i <= N; i++) + tier1.ConvertToPersistedBase( + CreateTestSnapshot(ids[i - 1], ids[i], TestItem.Addresses[(i - 1) % TestItem.Addresses.Length])).Dispose(); + + // Throw in two CompactSized snapshots (CompactSize=8) at boundaries 8 and 16 so the + // catalog has multi-bucket entries that exercise the bucket-routing branch + // in the parallel LoadSnapshot. 
+ tier1.Compactor.DoCompactCompactSized(ids[8]); + tier1.Compactor.DoCompactCompactSized(ids[16]); + } + + using FlatTestContainer tier2 = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb); + SnapshotRepository repo2 = tier2.Repository; + + Assert.That(repo2.PersistedSnapshotCount, Is.EqualTo(N + 2)); + for (int i = 1; i <= N; i++) + { + Assert.That(repo2.TryLeasePersistedState(ids[i], SnapshotTier.PersistedBase, out PersistedSnapshot? b), Is.True, $"base ids[{i}] missing"); + b!.Dispose(); + } + Assert.That(repo2.TryLeasePersistedState(ids[8], SnapshotTier.PersistedCompactSized, out PersistedSnapshot? p8), Is.True); + p8!.Dispose(); + Assert.That(repo2.TryLeasePersistedState(ids[16], SnapshotTier.PersistedCompactSized, out PersistedSnapshot? p16), Is.True); + p16!.Dispose(); + + // Ordered-id invariant: the bases tile the whole (0, N] window via their From chain. + // Catches a missing or mis-routed sorted-set entry. + using (PersistedSnapshotList chain = repo2.LeaseBaseSnapshotsInRange(ids[0], ids[N])) + Assert.That(chain.Count, Is.EqualTo(N), "every base must be reachable via the From chain"); + + // Bloom end-state: a bloom is rebuilt for the widest snapshot covering each range and shared + // across it — base ids[1] adopts the CompactSized covering (0, 8] rather than carrying its own. + Assert.That(repo2.TryLeasePersistedState(ids[8], SnapshotTier.PersistedCompactSized, out PersistedSnapshot? compactSizedAt8), Is.True); + using (compactSizedAt8) + { + Assert.That(compactSizedAt8!.Bloom.Count, Is.GreaterThan(0), "CompactSized at ids[8] must have a real bloom"); + Assert.That(repo2.TryLeasePersistedState(ids[1], SnapshotTier.PersistedBase, out PersistedSnapshot? 
baseAt1), Is.True); + using (baseAt1) + Assert.That(ReferenceEquals(baseAt1!.Bloom, compactSizedAt8.Bloom), Is.True, + "base ids[1] must share the CompactSized's bloom"); + } + } + + // With bloom disabled (bits-per-key 0) the loader's Convert path uses the AlwaysTrue + // sentinel and ReconstructBloom returns early on restart — data must still survive. + [Test] + public void LoadFromCatalog_BloomDisabled_SkipsReconstructionButDataSurvives() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("nb1")); + MemDb catalogDb = new(); + + using (FlatTestContainer tier1 = new( + config: new FlatDbConfig { PersistedSnapshotBloomBitsPerKey = 0 }, + arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb)) + { + tier1.ConvertToPersistedBase(CreateTestSnapshot(s0, s1, TestItem.AddressA)).Dispose(); + } + + using FlatTestContainer tier2 = new( + config: new FlatDbConfig { PersistedSnapshotBloomBitsPerKey = 0 }, + arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir, catalogDb: catalogDb); + + Assert.That(tier2.Repository.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? p), Is.True); + using (p) + { + Assert.That(p!.Bloom.Count, Is.EqualTo(0), "bloom disabled → AlwaysTrue sentinel, no reconstruction"); + Assert.That(p.TryGetAccount(TestItem.AddressA, out _), Is.True, "data must survive restart with bloom disabled"); + } + } + + // With validation enabled, Convert runs PersistedSnapshotUtils.ValidatePersistedSnapshot + // on the freshly written base; a valid snapshot must convert and round-trip without throwing. 
+    [Test]
+    public void ConvertToPersistedBase_WithValidationEnabled_RoundTrips()
+    {
+        // Validation is switched on for this container only.
+        FlatDbConfig validatingConfig = new() { ValidatePersistedSnapshot = true };
+        using FlatTestContainer container = new(
+            config: validatingConfig,
+            arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir);
+
+        StateId parentState = new(0, Keccak.EmptyTreeHash);
+        StateId childState = new(1, Keccak.Compute("val1"));
+
+        // Convert a one-account snapshot and read the account back from the persisted base.
+        using PersistedSnapshot persistedBase = container.ConvertToPersistedBase(
+            CreateTestSnapshot(parentState, childState, TestItem.AddressA, 77));
+
+        bool found = persistedBase.TryGetAccount(TestItem.AddressA, out Account? account);
+        Assert.That(found, Is.True);
+        Assert.That(account!.Balance, Is.EqualTo((UInt256)77));
+    }
+
+    // Converting a snapshot that carries a state trie node gives the base a contiguous trie-RLP
+    // blob run, so the blob-range advise calls take the non-empty fadvise branch. Against the
+    // test arena that is a no-op, but it must not throw.
+    [Test]
+    public void AdviseBlobRange_OnConvertedBaseWithTrieNodes_DoesNotThrow()
+    {
+        using FlatTestContainer container = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir);
+        StateId parentState = new(0, Keccak.EmptyTreeHash);
+        StateId childState = new(1, Keccak.Compute("blob1"));
+
+        // The only content is a single leaf node stored under a length-4 state path.
+        Nethermind.Trie.TreePath nodePath = new(Keccak.Compute("bp"), 4);
+        SnapshotContent content = new();
+        content.StateNodes[nodePath] = new Nethermind.Trie.TrieNode(Nethermind.Trie.NodeType.Leaf, [0xC2, 0x80, 0x80]);
+
+        Snapshot source = new(parentState, childState, content, _pool, ResourcePool.Usage.MainBlockProcessing);
+        using PersistedSnapshot persistedBase = container.ConvertToPersistedBase(source);
+
+        Assert.DoesNotThrow(() => persistedBase.AdviseWillNeedBlobRange());
+        Assert.DoesNotThrow(() => persistedBase.AdviseDontNeedBlobRange());
+
+        // The node written above must still be loadable after the advise calls.
+        Assert.That(persistedBase.TryLoadStateNodeRlp(nodePath, out _), Is.True);
+    }
+
+    // End-to-end-ish read-through: a base converted with a REAL bloom (default config),
+    // wrapped in a PersistedSnapshotStack, resolves a present account/slot and skips absent
+    // addresses — exercising the stack's real-bloom gate (MightContain == false → continue). 
+ [Test] + public void Stack_RealBloom_AdmitsPresentSkipsAbsentAddresses() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("rb1")); + using FlatTestContainer tier = new(arenaFileSizeBytes: 64 * 1024, baseDbPath: _testDir); + + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(123).TestObject; + byte[] slot = new byte[32]; slot[31] = 0x55; + content.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(slot); + PersistedSnapshot persisted = tier.ConvertToPersistedBase( + new Snapshot(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing)); + + PersistedSnapshotList list = new(1) { persisted }; + using PersistedSnapshotStack stack = new(list, recordDetailedMetrics: false); + + Assert.That(stack.TryGetAccount(TestItem.AddressA, out Account? a), Is.True); + Assert.That(a!.Balance, Is.EqualTo((UInt256)123)); + long start = System.Diagnostics.Stopwatch.GetTimestamp(); + Assert.That(stack.TryGetSlot(TestItem.AddressA, (UInt256)1, -1, start, out byte[]? sv), Is.True); + Assert.That(sv![^1], Is.EqualTo((byte)0x55)); + + // Absent addresses: the real bloom excludes them (or the snapshot misses) → fall through. 
+ foreach (Address absent in new[] { TestItem.AddressB, TestItem.AddressC, TestItem.AddressD, TestItem.AddressE, TestItem.AddressF }) + Assert.That(stack.TryGetAccount(absent, out _), Is.False, $"{absent} must not resolve"); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotTests.cs new file mode 100644 index 000000000000..0b4eae57bf74 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistedSnapshotTests.cs @@ -0,0 +1,895 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.Buffers.Binary; +using System.Collections.Generic; +using System.IO; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Core.Test.Builders; +using Nethermind.Db; +using Nethermind.Int256; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; +using NUnit.Framework; +using WholeReadScanner = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotScanner< + Nethermind.State.Flat.PersistedSnapshots.Storage.WholeReadSession, + Nethermind.State.Flat.PersistedSnapshots.Storage.WholeReadSessionReader, + Nethermind.State.Flat.Io.NoOpPin>; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class PersistedSnapshotTests +{ + private ResourcePool _resourcePool = null!; + private ArenaManager _memArena = null!; + private string _memArenaDir = null!; + private BlobArenaManager _blobs = null!; + private string _blobsDir = null!; + + [SetUp] + public void SetUp() + { + _resourcePool = new ResourcePool(new FlatDbConfig()); + _memArenaDir = Path.Combine(Path.GetTempPath(), $"nm-pstest-arena-{Guid.NewGuid():N}"); + _memArena = TestFixtureHelpers.CreateArenaManager(_memArenaDir); + _blobsDir = Path.Combine(Path.GetTempPath(), $"nm-pstest-blobs-{Guid.NewGuid():N}"); + _blobs = new BlobArenaManager(_blobsDir, 
4L * 1024 * 1024); + } + + [TearDown] + public void TearDown() + { + _blobs.Dispose(); + _memArena.Dispose(); + try { Directory.Delete(_blobsDir, recursive: true); } catch { /* best-effort */ } + try { Directory.Delete(_memArenaDir, recursive: true); } catch { /* best-effort */ } + } + + private PersistedSnapshot CreatePersistedSnapshot(StateId from, StateId to, byte[] data) => + TestFixtureHelpers.CreatePersistedSnapshot(_memArena, _blobs, from, to, data); + + [Test] + public void Trie_key_encoding_matches_persistence_tiers() + { + Span key = stackalloc byte[PersistedSnapshotKey.MaxKeyLength]; + Hash256 addr = Keccak.Compute("addr"); + ReadOnlySpan addrHash = addr.Bytes; + + TreePath stateTop = new(Keccak.Compute("s"), 5); + TreePath stateCompact = new(Keccak.Compute("s"), 6); + TreePath storShort = new(Keccak.Compute("s"), 4); + TreePath storCompactMax = new(Keccak.Compute("s"), 15); + TreePath storFallback = new(Keccak.Compute("s"), 16); + + int stateTopLen = PersistedSnapshotKey.WriteStateNodeKey(key, in stateTop); + int stateCompactLen = PersistedSnapshotKey.WriteStateNodeKey(key, in stateCompact); + int storShortLen = PersistedSnapshotKey.WriteStorageNodeKey(key, addrHash, in storShort); + int storCompactMaxLen = PersistedSnapshotKey.WriteStorageNodeKey(key, addrHash, in storCompactMax); + int storFallbackLen = PersistedSnapshotKey.WriteStorageNodeKey(key, addrHash, in storFallback); + + // Slots live in their own top-level column that sorts just before the account column. 
+ Span slotKey = stackalloc byte[PersistedSnapshotKey.MaxKeyLength]; + Span slot = stackalloc byte[32]; + int slotLen = PersistedSnapshotKey.WriteSlotKey(slotKey, TestItem.AddressA.Bytes, slot); + int accountLen = PersistedSnapshotKey.WriteAccountKey(key, TestItem.AddressA.Bytes); + byte slotColumn = slotKey[0]; + byte accountColumn = key[0]; + + Assert.Multiple(() => + { + Assert.That(stateTopLen, Is.EqualTo(4), "state top (0-5): column + 3-byte path"); + Assert.That(stateCompactLen, Is.EqualTo(9), "state compact (6-15): column + 8-byte path"); + Assert.That(storShortLen, Is.EqualTo(30), "storage 0-15: column + addrHash(20) + sub + 8-byte path"); + Assert.That(storCompactMaxLen, Is.EqualTo(30), "storage upper bound (15) stays compact — never a 4-byte top key"); + Assert.That(storFallbackLen, Is.EqualTo(55), "storage 16+: column + addrHash(20) + sub + 33-byte path"); + Assert.That(slotLen, Is.EqualTo(53), "slot: own column + addr(20) + slot(32), no per-address sub-tag"); + Assert.That(slotColumn, Is.EqualTo(PersistedSnapshotKey.SlotColumn)); + Assert.That(slotColumn, Is.LessThan(accountColumn), "slot column sorts before the account column"); + Assert.That(accountLen, Is.EqualTo(21), "per-address: account column + addr(20), no sub-tag"); + }); + } + + private static IEnumerable RoundTripTestCases() + { + yield return new TestCaseData((Action)(c => + { + c.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1000).TestObject; + })).SetName("Account"); + + yield return new TestCaseData((Action)(c => + { + c.SelfDestructedStorageAddresses[TestItem.AddressA] = false; + })).SetName("SelfDestruct"); + + yield return new TestCaseData((Action)(c => + { + TreePath path = new(Keccak.Compute("path"), 4); + c.StateNodes[path] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]); + })).SetName("StateNode_TopPath"); + + yield return new TestCaseData((Action)(c => + { + TreePath path = new(Keccak.Compute("path"), 8); + c.StateNodes[path] = new TrieNode(NodeType.Leaf, [0xC2, 
0x80, 0x80]); + })).SetName("StateNode_CompactPath"); + + yield return new TestCaseData((Action)(c => + { + TreePath longPath = new(Keccak.Compute("longpath"), 20); + c.StateNodes[longPath] = new TrieNode(NodeType.Extension, [0xC2, 0x80, 0x81]); + })).SetName("StateNode_LongPath"); + + yield return new TestCaseData((Action)(c => + { + byte[] value = new byte[32]; + value[31] = 0xFF; + c.Storages[(TestItem.AddressA, (UInt256)42)] = new SlotValue(value); + })).SetName("Storage_SingleSlot"); + + // Single significant byte < 0x80: RLP wraps it to the byte itself (1 byte), so the + // stored length is still 1 — distinct from the length-0 absent sentinel. + yield return new TestCaseData((Action)(c => + { + byte[] value = new byte[32]; + value[31] = 0x05; + c.Storages[(TestItem.AddressA, (UInt256)9)] = new SlotValue(value); + })).SetName("Storage_SmallSingleByteSlot"); + + yield return new TestCaseData((Action)(c => + { + byte[] value = new byte[32]; + value[31] = 0xAB; + c.Storages[(TestItem.AddressA, UInt256.Zero)] = new SlotValue(value); + })).SetName("Storage_ZeroSlot"); + + yield return new TestCaseData((Action)(c => + { + c.Storages[(TestItem.AddressA, (UInt256)1)] = null; + byte[] val = new byte[32]; + val[31] = 0xFF; + c.Storages[(TestItem.AddressA, (UInt256)2)] = new SlotValue(val); + })).SetName("Storage_NullSlot"); + + yield return new TestCaseData((Action)(c => + { + byte[] val1 = new byte[32]; val1[31] = 0x01; + byte[] val2 = new byte[32]; val2[31] = 0x02; + byte[] val3 = new byte[32]; val3[31] = 0x03; + c.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(val1); + c.Storages[(TestItem.AddressA, (UInt256)2)] = new SlotValue(val2); + c.Storages[(TestItem.AddressB, (UInt256)5)] = new SlotValue(val3); + })).SetName("Storage_MultipleAddresses"); + + // Storage has no top tier — a length-4 path lands in the 8-byte compact encoding. 
+ yield return new TestCaseData((Action)(c => + { + Hash256 address = Keccak.Compute("address"); + TreePath path = new(Keccak.Compute("path"), 4); + c.StorageNodes[(address, path)] = new TrieNode(NodeType.Branch, [0xC1, 0x80]); + })).SetName("StorageNode_ShortPath"); + + yield return new TestCaseData((Action)(c => + { + Hash256 address = Keccak.Compute("address"); + TreePath path = new(Keccak.Compute("path"), 6); + c.StorageNodes[(address, path)] = new TrieNode(NodeType.Branch, [0xC1, 0x80]); + })).SetName("StorageNode_CompactPath"); + + yield return new TestCaseData((Action)(c => + { + Hash256 address = Keccak.Compute("address"); + TreePath longPath = new(Keccak.Compute("longpath"), 18); + c.StorageNodes[(address, longPath)] = new TrieNode(NodeType.Branch, [0xC3, 0x80, 0x81, 0x82]); + })).SetName("StorageNode_LongPath"); + + yield return new TestCaseData((Action)(c => + { + c.Accounts[TestItem.AddressA] = Build.An.Account + .WithBalance(12345).WithNonce(7).TestObject; + c.Accounts[TestItem.AddressB] = Build.An.Account + .WithBalance(0).WithNonce(0) + .WithCode([0x60, 0x00]) + .WithStorageRoot(Keccak.Compute("storage")).TestObject; + c.Accounts[TestItem.AddressC] = null; + + byte[] slotVal1 = new byte[32]; slotVal1[31] = 0xFF; + byte[] slotVal2 = new byte[32]; slotVal2[0] = 0x01; slotVal2[31] = 0x02; + c.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(slotVal1); + c.Storages[(TestItem.AddressA, (UInt256)2)] = new SlotValue(slotVal2); + c.Storages[(TestItem.AddressB, (UInt256)42)] = null; + + c.SelfDestructedStorageAddresses[TestItem.AddressD] = false; + c.SelfDestructedStorageAddresses[TestItem.AddressE] = true; + + TreePath topStatePath = new(Keccak.Compute("tp"), 3); + c.StateNodes[topStatePath] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + + TreePath shortStatePath = new(Keccak.Compute("sp"), 8); + c.StateNodes[shortStatePath] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]); + + TreePath longStatePath = new(Keccak.Compute("lp"), 20); + 
c.StateNodes[longStatePath] = new TrieNode(NodeType.Extension, [0xC2, 0x80, 0x81]);
+
+            Hash256 storageAddr = Keccak.Compute("storageAddr");
+            TreePath topStoragePath = new(Keccak.Compute("tsp"), 3);
+            c.StorageNodes[(storageAddr, topStoragePath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]);
+
+            TreePath shortStoragePath = new(Keccak.Compute("ssp"), 6);
+            c.StorageNodes[(storageAddr, shortStoragePath)] = new TrieNode(NodeType.Branch, [0xC1, 0x80]);
+
+            TreePath longStoragePath = new(Keccak.Compute("lsp"), 18);
+            c.StorageNodes[(storageAddr, longStoragePath)] = new TrieNode(NodeType.Leaf, [0xC3, 0x80, 0x81, 0x82]);
+        })).SetName("AllDataTypes");
+    }
+
+    // Populates an empty SnapshotContent via the supplied case, builds the serialized form,
+    // wraps it as a PersistedSnapshot, and requires ValidatePersistedSnapshot to accept the
+    // in-memory/persisted pair without throwing.
+    [TestCaseSource(nameof(RoundTripTestCases))]
+    public void RoundTrip(Action populateContent)
+    {
+        StateId from = new(0, Keccak.EmptyTreeHash);
+        StateId to = new(1, Keccak.Compute("1"));
+
+        SnapshotContent content = new();
+        populateContent(content);
+
+        Snapshot snapshot = new(from, to, content, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, _blobs);
+
+        // Dispose the persisted snapshot when the test ends, as the other tests in this fixture do,
+        // so it is released before TearDown disposes the arenas it was created from.
+        using PersistedSnapshot persisted = CreatePersistedSnapshot(from, to, data);
+
+        Assert.DoesNotThrow(() => PersistedSnapshotUtils.ValidatePersistedSnapshot(snapshot, persisted));
+    }
+
+    // Regression: a storage-trie node record can land within <12 bytes of a 4 KiB boundary in a
+    // region-relative (SpanByteReader-scoped) read; TryLoadNode used to clamp the speculative
+    // window to that short page remainder and overrun the 12-byte header. A single account with
+    // ~280 spread-out slots places such a node; reading every slot back must not throw. 
+    [Test]
+    public void StorageNode_NearPageBoundary_RoundTrips()
+    {
+        Address a = TestItem.AddressA;
+        const int slotCount = 280;
+
+        // One account with slotCount slots; keys are Keccak hashes of the index so they are
+        // spread out, and every value is non-zero in its last byte.
+        SnapshotContent content = new();
+        content.Accounts[a] = Build.An.Account.WithBalance(1).TestObject;
+        SlotValue[] expected = new SlotValue[slotCount];
+        UInt256[] keys = new UInt256[slotCount];
+        for (int i = 0; i < slotCount; i++)
+        {
+            keys[i] = new UInt256(Keccak.Compute(i.ToString()).Bytes, isBigEndian: true);
+            byte[] v = new byte[32];
+            v[31] = (byte)((i % 255) + 1);
+            expected[i] = new SlotValue(v);
+            content.Storages[(a, keys[i])] = expected[i];
+        }
+
+        StateId from = new(0, Keccak.EmptyTreeHash), to = new(1, Keccak.Compute("to"));
+        string arenaDir = Path.Combine(Path.GetTempPath(), $"nm-regr-arena-{Guid.NewGuid():N}");
+        string blobsDir = Path.Combine(Path.GetTempPath(), $"nm-regr-{Guid.NewGuid():N}");
+        try
+        {
+            // The arenas are scoped to the try block so they are disposed BEFORE the finally
+            // below deletes their directories (same dispose-then-delete order as TearDown).
+            // Declared at method scope they would be disposed only after the finally had run.
+            using ArenaManager arena = TestFixtureHelpers.CreateArenaManager(arenaDir, 64 * 1024 * 1024);
+            using BlobArenaManager blobs = new(blobsDir, 64L * 1024 * 1024);
+
+            Snapshot snapshot = new(from, to, content, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+            byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, blobs);
+            using PersistedSnapshot persisted = TestFixtureHelpers.CreatePersistedSnapshot(arena, blobs, from, to, data);
+
+            Assert.DoesNotThrow(() =>
+            {
+                for (int i = 0; i < slotCount; i++)
+                {
+                    SlotValue got = default;
+                    Assert.That(persisted.TryGetSlot(a, keys[i], ref got), Is.True, $"slot {i} missing");
+                    Assert.That(got.AsReadOnlySpan.SequenceEqual(expected[i].AsReadOnlySpan), Is.True, $"slot {i} mismatch");
+                }
+            });
+        }
+        finally
+        {
+            // Also reached when arena construction throws, so a partially created directory is removed.
+            try { Directory.Delete(blobsDir, recursive: true); } catch { /* best-effort */ }
+            try { Directory.Delete(arenaDir, recursive: true); } catch { /* best-effort */ }
+        }
+    }
+
+    // Covers the scanner slot-decode path (PersistedSnapshotScanner.SlotEntry.Value), which
+    // PersistPersistedSnapshot uses to flush slots back into the flat DB. 
Slot values are now + // RLP-wrapped; this asserts varied widths (1-byte < 0x80, 1-byte >= 0x80, full 32 bytes) + // decode correctly and that a null/deleted slot is surfaced as null (length-0 sentinel). + [Test] + public void Slot_scanner_round_trips_rlp_wrapped_values() + { + StateId from = new(0, Keccak.EmptyTreeHash); + StateId to = new(1, Keccak.Compute("scan")); + + byte[] small = new byte[32]; small[31] = 0x05; // RLP(0x05) = 0x05 + byte[] high = new byte[32]; high[31] = 0xFF; // RLP(0xff) = 0x81 0xff + byte[] full = new byte[32]; + for (int i = 0; i < 32; i++) full[i] = (byte)(i + 1); // RLP = 0xa0 + 32 bytes + + SnapshotContent content = new(); + content.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(small); + content.Storages[(TestItem.AddressA, (UInt256)2)] = new SlotValue(high); + content.Storages[(TestItem.AddressA, (UInt256)3)] = null; + content.Storages[(TestItem.AddressB, (UInt256)4)] = new SlotValue(full); + + Snapshot snapshot = new(from, to, content, _resourcePool, ResourcePool.Usage.MainBlockProcessing); + byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, _blobs); + using PersistedSnapshot persisted = CreatePersistedSnapshot(from, to, data); + + Dictionary<(Address, UInt256), SlotValue?> scanned = []; + using (WholeReadSession session = persisted.BeginWholeReadSession()) + { + WholeReadScanner scanner = PersistedSnapshotScanner.ForWholeRead(session, persisted); + foreach (WholeReadScanner.SlotEntry slot in scanner.Slots) + scanned[(slot.Address, slot.Slot)] = slot.Value; + } + + Assert.That(scanned[(TestItem.AddressA, (UInt256)1)]!.Value.AsReadOnlySpan.ToArray(), Is.EqualTo(small)); + Assert.That(scanned[(TestItem.AddressA, (UInt256)2)]!.Value.AsReadOnlySpan.ToArray(), Is.EqualTo(high)); + Assert.That(scanned[(TestItem.AddressA, (UInt256)3)], Is.Null, "deleted slot must surface as null"); + Assert.That(scanned[(TestItem.AddressB, (UInt256)4)]!.Value.AsReadOnlySpan.ToArray(), Is.EqualTo(full)); + } + + // Drives the 
scanner across every entry kind in one pass: normal vs deleted account, + // self-destruct destructed vs new, an address with a self-destruct but no account change, + // present vs deleted slot, and state/storage trie nodes spread across all three depth tiers. + [Test] + public void FullScan_DecodesAccounts_SelfDestruct_Slots_StateAndStorageNodes() + { + StateId from = new(0, Keccak.EmptyTreeHash); + StateId to = new(1, Keccak.Compute("fullscan")); + + byte[] slotVal = new byte[32]; slotVal[31] = 0x11; + + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1000).WithNonce(3).TestObject; + content.Accounts[TestItem.AddressC] = null; // deleted marker + content.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(slotVal); + content.Storages[(TestItem.AddressA, (UInt256)2)] = null; + content.SelfDestructedStorageAddresses[TestItem.AddressD] = false; // destructed + content.SelfDestructedStorageAddresses[TestItem.AddressE] = true; // new-account + TreePath stTop = new(Keccak.Compute("st-top"), 3); + TreePath stMid = new(Keccak.Compute("st-mid"), 8); + TreePath stLong = new(Keccak.Compute("st-long"), 20); + content.StateNodes[stTop] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + content.StateNodes[stMid] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]); + content.StateNodes[stLong] = new TrieNode(NodeType.Extension, [0xC2, 0x80, 0x81]); + Hash256 storageAddr = Keccak.Compute("storage-addr"); + TreePath snTop = new(Keccak.Compute("sn-top"), 3); + TreePath snMid = new(Keccak.Compute("sn-mid"), 6); + TreePath snLong = new(Keccak.Compute("sn-long"), 18); + content.StorageNodes[(storageAddr, snTop)] = new TrieNode(NodeType.Leaf, [0xC1, 0x81]); + content.StorageNodes[(storageAddr, snMid)] = new TrieNode(NodeType.Branch, [0xC1, 0x82]); + content.StorageNodes[(storageAddr, snLong)] = new TrieNode(NodeType.Leaf, [0xC3, 0x80, 0x81, 0x82]); + + Snapshot snapshot = new(from, to, content, _resourcePool, 
ResourcePool.Usage.MainBlockProcessing); + byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, _blobs); + using PersistedSnapshot persisted = CreatePersistedSnapshot(from, to, data); + + Dictionary perAddr = []; + Dictionary<(Address, UInt256), SlotValue?> slots = []; + int stateNodes = 0, storageNodes = 0; + + using (WholeReadSession session = persisted.BeginWholeReadSession()) + { + WholeReadScanner scanner = PersistedSnapshotScanner.ForWholeRead(session, persisted); + foreach (WholeReadScanner.PerAddressEntry e in scanner.PerAddresses) + perAddr[e.Address] = (e.HasAccount, e.Account?.Balance, e.SelfDestructFlag); + foreach (WholeReadScanner.SlotEntry s in scanner.Slots) + slots[(s.Address, s.Slot)] = s.Value; + foreach (WholeReadScanner.StateNodeEntry n in scanner.StateNodes) + { + _ = n.Path; // exercise the stage-specific path decode + Assert.That(n.Rlp.Length, Is.GreaterThan(0)); + stateNodes++; + } + foreach (WholeReadScanner.StorageNodeEntry n in scanner.StorageNodes) + { + _ = n.Path; + _ = n.AddressHash; + Assert.That(n.Rlp.Length, Is.GreaterThan(0)); + storageNodes++; + } + } + + Assert.That(perAddr[TestItem.AddressA].HasAccount, Is.True); + Assert.That(perAddr[TestItem.AddressA].Balance, Is.EqualTo((UInt256)1000)); + Assert.That(perAddr[TestItem.AddressA].Sd, Is.Null, "address with no self-destruct → null flag"); + Assert.That(perAddr[TestItem.AddressC].HasAccount, Is.True, "deleted account still has a per-address entry"); + Assert.That(perAddr[TestItem.AddressC].Balance, Is.Null, "deleted account decodes to null"); + Assert.That(perAddr[TestItem.AddressD].HasAccount, Is.False, "self-destruct-only address has no account change"); + Assert.That(perAddr[TestItem.AddressD].Sd, Is.False, "destructed → false"); + Assert.That(perAddr[TestItem.AddressE].Sd, Is.True, "new account → true"); + + Assert.That(slots[(TestItem.AddressA, (UInt256)1)]!.Value.AsReadOnlySpan.ToArray(), Is.EqualTo(slotVal)); + Assert.That(slots[(TestItem.AddressA, 
(UInt256)2)], Is.Null, "deleted slot surfaces as null"); + + Assert.That(stateNodes, Is.EqualTo(3), "one state node per depth tier"); + Assert.That(storageNodes, Is.EqualTo(3), "one storage node per depth tier"); + } + + // When a column / sub-tag tier is absent, the enumerators must seek past it gracefully: + // state nodes only in the top tier, storage nodes only in the fallback tier, and no + // per-address column at all. + [Test] + public void Scan_AbsentTiers_SkipMissingColumnsAndSubTags() + { + StateId from = new(0, Keccak.EmptyTreeHash); + StateId to = new(1, Keccak.Compute("absent")); + + SnapshotContent content = new(); + TreePath onlyTop = new(Keccak.Compute("only-top"), 3); + content.StateNodes[onlyTop] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + Hash256 storageAddr = Keccak.Compute("absent-storage"); + TreePath onlyFallback = new(Keccak.Compute("only-fallback"), 18); + content.StorageNodes[(storageAddr, onlyFallback)] = new TrieNode(NodeType.Leaf, [0xC3, 0x80, 0x81, 0x82]); + + Snapshot snapshot = new(from, to, content, _resourcePool, ResourcePool.Usage.MainBlockProcessing); + byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, _blobs); + using PersistedSnapshot persisted = CreatePersistedSnapshot(from, to, data); + + int perAddrCount = 0, stateNodes = 0, storageNodes = 0; + using (WholeReadSession session = persisted.BeginWholeReadSession()) + { + WholeReadScanner scanner = PersistedSnapshotScanner.ForWholeRead(session, persisted); + foreach (WholeReadScanner.PerAddressEntry _ in scanner.PerAddresses) perAddrCount++; + foreach (WholeReadScanner.StateNodeEntry n in scanner.StateNodes) { _ = n.Path; stateNodes++; } + foreach (WholeReadScanner.StorageNodeEntry n in scanner.StorageNodes) { _ = n.Path; storageNodes++; } + } + + Assert.That(perAddrCount, Is.EqualTo(0), "no per-address column → empty enumeration"); + Assert.That(stateNodes, Is.EqualTo(1), "only the top-tier state node, compact/fallback columns absent"); + 
Assert.That(storageNodes, Is.EqualTo(1), "only the fallback-tier storage node, top/compact sub-tags absent"); + } + + // Exercises the read-path miss branches: a present snapshot queried for keys that are + // absent at every level — unknown address, present-address/absent-slot, present-address/ + // no-self-destruct, absent state node, absent storage addressHash, and present-addressHash/ + // absent-path (same and different sub-tag tier). + [Test] + public void Queries_ForAbsentKeys_ReturnMisses() + { + StateId from = new(0, Keccak.EmptyTreeHash); + StateId to = new(1, Keccak.Compute("miss")); + + byte[] slotVal = new byte[32]; slotVal[31] = 0x07; + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(5).TestObject; + content.Accounts[TestItem.AddressC] = Build.An.Account.WithBalance(9).TestObject; // 2nd address → real address BTree + content.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(slotVal); + content.SelfDestructedStorageAddresses[TestItem.AddressA] = true; + TreePath statePath = new(Keccak.Compute("sp"), 4); + content.StateNodes[statePath] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]); + Hash256 storageHashObj = Keccak.Compute("sh"); + TreePath storagePath = new(Keccak.Compute("stp"), 4); + content.StorageNodes[(storageHashObj, storagePath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x81]); + + Snapshot snapshot = new(from, to, content, _resourcePool, ResourcePool.Usage.MainBlockProcessing); + byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, _blobs); + using PersistedSnapshot persisted = CreatePersistedSnapshot(from, to, data); + + SlotValue sv = default; + // Unknown address: BTree seek misses. + Assert.That(persisted.TryGetAccount(TestItem.AddressB, out Account? 
accB), Is.False); + Assert.That(accB, Is.Null); + Assert.That(persisted.TryGetSlot(TestItem.AddressB, (UInt256)1, ref sv), Is.False); + Assert.That(persisted.TryGetSelfDestructFlag(TestItem.AddressB), Is.Null); + + // Present address, absent slot index; present address with no slot/self-destruct sub-tag. + Assert.That(persisted.TryGetSlot(TestItem.AddressA, (UInt256)999, ref sv), Is.False); + Assert.That(persisted.TryGetSlot(TestItem.AddressC, (UInt256)1, ref sv), Is.False); + Assert.That(persisted.TryGetSelfDestructFlag(TestItem.AddressC), Is.Null); + + // Absent state node. + Assert.That(persisted.TryLoadStateNodeRlp(new TreePath(Keccak.Compute("absent"), 4), out byte[]? sn), Is.False); + Assert.That(sn, Is.Null); + + // Storage node: absent addressHash; present addressHash with absent path in the same + // sub-tag tier and in a different (absent) tier. + ValueHash256 storageHash = new(storageHashObj.Bytes); + Assert.That(persisted.TryLoadStorageNodeRlp(new ValueHash256(Keccak.Compute("nope").Bytes), storagePath, out _), Is.False); + Assert.That(persisted.TryLoadStorageNodeRlp(storageHash, new TreePath(Keccak.Compute("absentSameTier"), 4), out _), Is.False); + Assert.That(persisted.TryLoadStorageNodeRlp(storageHash, new TreePath(Keccak.Compute("absentDeep"), 18), out _), Is.False); + + Assert.That(persisted.TryGetAccount(TestItem.AddressA, out _), Is.True); + Assert.That(persisted.TryLoadStorageNodeRlp(storageHash, storagePath, out _), Is.True); + } + + // An empty snapshot has no address column (cached BTree bound is empty) and no node + // columns, so every read returns a miss without faulting. 
+    [Test]
+    public void Queries_OnEmptySnapshot_ReturnMisses()
+    {
+        // Persist a snapshot built from a SnapshotContent that nothing was added to.
+        StateId origin = new(0, Keccak.EmptyTreeHash);
+        StateId target = new(1, Keccak.Compute("empty-reads"));
+        SnapshotContent nothing = new();
+        Snapshot emptySnapshot = new(origin, target, nothing, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] encoded = PersistedSnapshotBuilderTestExtensions.Build(emptySnapshot, _blobs);
+        using PersistedSnapshot onDisk = CreatePersistedSnapshot(origin, target, encoded);
+
+        // Per-address lookups: account, slot and self-destruct flag must all miss.
+        Address probeAddress = TestItem.AddressA;
+        SlotValue slotBuffer = default;
+        Assert.That(onDisk.TryGetAccount(probeAddress, out _), Is.False);
+        Assert.That(onDisk.TryGetSlot(probeAddress, (UInt256)1, ref slotBuffer), Is.False);
+        Assert.That(onDisk.TryGetSelfDestructFlag(probeAddress), Is.Null);
+
+        // Trie-node lookups: the same probe path is used for the state and the storage query.
+        TreePath probePath = new(Keccak.Compute("p"), 4);
+        ValueHash256 probeStorageHash = new(Keccak.Compute("h").Bytes);
+        Assert.That(onDisk.TryLoadStateNodeRlp(probePath, out _), Is.False);
+        Assert.That(onDisk.TryLoadStorageNodeRlp(probeStorageHash, probePath, out _), Is.False);
+
+        // A snapshot produced by Build has no blob_range metadata (BlobRange.None), so both
+        // advise calls are expected to be no-ops; the test only requires that they do not throw.
+        Assert.DoesNotThrow(() => onDisk.AdviseWillNeedBlobRange());
+        Assert.DoesNotThrow(() => onDisk.AdviseDontNeedBlobRange());
+    }
+
+    // Drives PersistedSnapshotStack's newest-first probe loops over a two-snapshot stack:
+    // hits in the newer and (after a newer miss) the older snapshot, full misses, the
+    // self-destruct slot boundary, and the detailed-metrics observations.
+    [Test]
+    public void Stack_ProbesNewestFirst_AcrossAllKinds()
+    {
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s1 = new(1, Keccak.Compute("st1"));
+        StateId s2 = new(2, Keccak.Compute("st2"));
+
+        byte[] v1 = new byte[32]; v1[31] = 0x11;
+        byte[] v2 = new byte[32]; v2[31] = 0x22;
+
+        SnapshotContent older = new();
+        older.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(100).TestObject;
+        older.Accounts[TestItem.AddressD] = Build.An.Account.WithBalance(40).TestObject;
+        older.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(v1);
+        older.SelfDestructedStorageAddresses[TestItem.AddressA] = false;
+        TreePath statePath = new(Keccak.Compute("st-p"), 4);
+        older.StateNodes[statePath] = new TrieNode(NodeType.Leaf, [0xC1, 0x80]);
+        Hash256 storageHashObj = Keccak.Compute("st-sh");
+        TreePath storagePath = new(Keccak.Compute("st-sp"), 4);
+        older.StorageNodes[(storageHashObj, storagePath)] = new TrieNode(NodeType.Leaf, [0xC1, 0x81]);
+
+        SnapshotContent newer = new();
+        newer.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(200).TestObject;
+        newer.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(7).TestObject;
+        newer.Storages[(TestItem.AddressA, (UInt256)2)] = new SlotValue(v2);
+
+        byte[] olderData = PersistedSnapshotBuilderTestExtensions.Build(
+            new Snapshot(s0, s1, older, _resourcePool, ResourcePool.Usage.MainBlockProcessing), _blobs);
+        byte[] newerData = PersistedSnapshotBuilderTestExtensions.Build(
+            new Snapshot(s1, s2, newer, _resourcePool, ResourcePool.Usage.MainBlockProcessing), _blobs);
+
+        // List order is oldest-first: index 0 is the older snapshot, index 1 the newer one.
+        PersistedSnapshotList list = new(2) { CreatePersistedSnapshot(s0, s1, olderData), CreatePersistedSnapshot(s1, s2, newerData) };
+        using PersistedSnapshotStack stack = new(list, recordDetailedMetrics: true);
+
+        // Account: newest wins; older-only address resolves after the newer miss; full miss.
+        Assert.That(stack.TryGetAccount(TestItem.AddressA, out Account? a), Is.True);
+        Assert.That(a!.Balance, Is.EqualTo((UInt256)200), "newest snapshot wins");
+        Assert.That(stack.TryGetAccount(TestItem.AddressD, out Account? d), Is.True);
+        Assert.That(d!.Balance, Is.EqualTo((UInt256)40), "older-only address resolves after newer miss");
+        Assert.That(stack.TryGetAccount(TestItem.AddressF, out _), Is.False);
+
+        // Self-destruct: only the older snapshot carries it.
+        Assert.That(stack.TryGetSelfDestruct(TestItem.AddressA, out int sdIdx), Is.True);
+        Assert.That(sdIdx, Is.EqualTo(0));
+        Assert.That(stack.TryGetSelfDestruct(TestItem.AddressF, out _), Is.False);
+
+        long start = System.Diagnostics.Stopwatch.GetTimestamp();
+        // Slot: newer holds slot 2, older holds slot 1; both resolve.
+        Assert.That(stack.TryGetSlot(TestItem.AddressA, (UInt256)2, -1, start, out byte[]? sv2), Is.True);
+        Assert.That(sv2![^1], Is.EqualTo((byte)0x22)); // ToEvmBytes strips leading zeros
+        Assert.That(stack.TryGetSlot(TestItem.AddressA, (UInt256)1, -1, start, out byte[]? sv1), Is.True);
+        Assert.That(sv1![^1], Is.EqualTo((byte)0x11));
+        // Slot below the self-destruct boundary resolves to null (storage wiped).
+        Assert.That(stack.TryGetSlot(TestItem.AddressA, (UInt256)999, 0, start, out byte[]? svNull), Is.True);
+        Assert.That(svNull, Is.Null);
+        // Slot fully absent (no boundary) falls through.
+        Assert.That(stack.TryGetSlot(TestItem.AddressF, (UInt256)1, -1, start, out _), Is.False);
+
+        // State / storage node RLP: present (in older) and absent.
+        Assert.That(stack.TryLoadStateRlp(statePath, out byte[]? srlp), Is.True);
+        Assert.That(srlp, Is.Not.Null);
+        Assert.That(stack.TryLoadStateRlp(new TreePath(Keccak.Compute("nope-st"), 4), out _), Is.False);
+        Assert.That(stack.TryLoadStorageRlp(storageHashObj, storagePath, out byte[]? strlp), Is.True);
+        Assert.That(strlp, Is.Not.Null);
+        Assert.That(stack.TryLoadStorageRlp(storageHashObj, new TreePath(Keccak.Compute("nope-sp"), 4), out _), Is.False);
+    }
+
+    // Creating two persisted snapshots must raise the summed ActivePersistedSnapshotCount
+    // metric by two, and each Dispose must lower it by one, ending back at the baseline.
+    [Test]
+    public void ActivePersistedSnapshotCount_TracksConstructionAndDisposal()
+    {
+        StateId from = new(0, Keccak.EmptyTreeHash);
+        StateId to1 = new(1, Keccak.Compute("one"));
+        StateId to2 = new(2, Keccak.Compute("two"));
+
+        Snapshot inMem1 = new(from, to1, new SnapshotContent(), _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        Snapshot inMem2 = new(from, to2, new SnapshotContent(), _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] data1 = PersistedSnapshotBuilderTestExtensions.Build(inMem1, _blobs);
+        byte[] data2 = PersistedSnapshotBuilderTestExtensions.Build(inMem2, _blobs);
+
+        // Compare against a baseline rather than zero: the metric is read from a static member.
+        long baseline = Active();
+
+        PersistedSnapshot s1 = CreatePersistedSnapshot(from, to1, data1);
+        PersistedSnapshot s2 = CreatePersistedSnapshot(from, to2, data2);
+
+        Assert.That(Active(), Is.EqualTo(baseline + 2));
+
+        s1.Dispose();
+        Assert.That(Active(), Is.EqualTo(baseline + 1));
+
+        s2.Dispose();
+        Assert.That(Active(), Is.EqualTo(baseline));
+
+        // Sums the metric over all of its entries.
+        static long Active()
+        {
+            long total = 0;
+            // Only the values are summed, so the entry type is left to inference.
+            foreach (var kv in Metrics.ActivePersistedSnapshotCount)
+                total += kv.Value;
+            return total;
+        }
+    }
+
+    // Building a snapshot that contains a trie node must grow Metrics.BlobAllocatedBytes, and
+    // disposing the only persisted snapshot referencing it must bring the gauge back to baseline.
+    [Test]
+    public void BlobArena_FrontierResets_WhenLastPersistedSnapshotDisposes()
+    {
+        StateId from = new(0, Keccak.EmptyTreeHash);
+        StateId to = new(1, Keccak.Compute("reset"));
+
+        Snapshot inMem = new(from, to, new SnapshotContent(), _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        TreePath path = new(Keccak.Compute("p"), 8);
+        inMem.Content.StateNodes[path] = new TrieNode(NodeType.Leaf, [0xC2, 0x80, 0x80]);
+
+        long baselineBytes = Metrics.BlobAllocatedBytes;
+        byte[] data = PersistedSnapshotBuilderTestExtensions.Build(inMem, _blobs);
+        long afterBuild = Metrics.BlobAllocatedBytes;
+        Assert.That(afterBuild, Is.GreaterThan(baselineBytes), "Building a snapshot with trie nodes should grow blob-allocated bytes");
+
+        // Skip LeaseBlobIds: it acquires an extra lease per blob id that other
+        // tests rely on but that this test must not leave dangling, otherwise the
+        // orphan-reset would correctly refuse to fire.
+        TestFixtureHelpers.CreatePersistedSnapshot(_memArena, _blobs, from, to, data, leaseBlobIds: false)
+            .Dispose();
+
+        // After the last external lease drops, the manager's TryResetOrphanedFrontier
+        // should have reset the file's frontier and pushed the delta back to the gauge.
+        Assert.That(Metrics.BlobAllocatedBytes, Is.EqualTo(baselineBytes),
+            "Blob-allocated bytes must drop back to baseline once the last referencing snapshot is disposed");
+    }
+
+    // A NodeRef written into a NodeRef.Size-byte buffer (asserted to be 6 bytes) must read back
+    // with the same arena id and offset, including the minimum and maximum values of both fields.
+    [TestCase((ushort)0, 0)]
+    [TestCase((ushort)42, 12345)]
+    [TestCase(ushort.MaxValue, int.MaxValue)]
+    public void NodeRef_ReadWrite_RoundTrip(ushort id, int offset)
+    {
+        Assert.That(NodeRef.Size, Is.EqualTo(6));
+        NodeRef original = new(id, offset);
+        byte[] buffer = new byte[NodeRef.Size];
+        NodeRef.Write(buffer, original);
+        NodeRef decoded = NodeRef.Read(buffer);
+
+        Assert.That(decoded.BlobArenaId, Is.EqualTo(id));
+        Assert.That(decoded.RlpDataOffset, Is.EqualTo(offset));
+    }
+
+    // Two snapshots store different RLP under the same state path; walking the list from the
+    // last index down must return the RLP of the snapshot that was added last.
+    [Test]
+    public void PersistedSnapshotList_Queries_NewestFirst()
+    {
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s1 = new(1, Keccak.Compute("1"));
+        StateId s2 = new(2, Keccak.Compute("2"));
+
+        // path length 4 → StateTopNodes column
+        TreePath path = new(Keccak.Compute("path"), 4);
+        byte[] rlp1 = [0xC0];
+        byte[] rlp2 = [0xC1, 0x80];
+
+        SnapshotContent content1 = new();
+        content1.StateNodes[path] = new TrieNode(NodeType.Leaf, rlp1);
+        Snapshot snap1 = new(s0, s1, content1, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+
+        SnapshotContent content2 = new();
+        content2.StateNodes[path] = new TrieNode(NodeType.Leaf, rlp2);
+        Snapshot snap2 = new(s1, s2, content2, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+
+        byte[] data1 = PersistedSnapshotBuilderTestExtensions.Build(snap1, _blobs);
+        byte[] data2 = PersistedSnapshotBuilderTestExtensions.Build(snap2, _blobs);
+
+        PersistedSnapshot p1 = CreatePersistedSnapshot(s0, s1, data1);
+        PersistedSnapshot p2 = CreatePersistedSnapshot(s1, s2, data2);
+
+        // Ordered oldest-first; query newest-first via indexer
+        PersistedSnapshotList list = new(2) { p1, p2 };
+        byte[]? result = null;
+        bool found = false;
+        for (int i = list.Count - 1; i >= 0; i--)
+        {
+            if (list[i].TryLoadStateNodeRlp(path, out result))
+            {
+                found = true;
+                break;
+            }
+        }
+
+        Assert.That(found, Is.True);
+        Assert.That(result, Is.EqualTo(rlp2));
+    }
+
+    // Merges two snapshots whose storage overlaps on (addrA, slot 1): the merged snapshot must
+    // hold the newer value for the shared slot and keep the slots present in only one input.
+    [Test]
+    public void Storage_NestedMerge_OverlappingAddresses()
+    {
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s1 = new(1, Keccak.Compute("1"));
+        StateId s2 = new(2, Keccak.Compute("2"));
+
+        Address addrA = TestItem.AddressA;
+        Address addrB = TestItem.AddressB;
+        byte[] val1 = new byte[32]; val1[31] = 0x01;
+        byte[] val2 = new byte[32]; val2[31] = 0x02;
+        byte[] val3 = new byte[32]; val3[31] = 0x03;
+
+        SnapshotContent content1 = new();
+        content1.Storages[(addrA, (UInt256)1)] = new SlotValue(val1);
+        content1.Storages[(addrB, (UInt256)5)] = new SlotValue(val2);
+        Snapshot snap1 = new(s0, s1, content1, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] data1 = PersistedSnapshotBuilderTestExtensions.Build(snap1, _blobs);
+
+        SnapshotContent content2 = new();
+        content2.Storages[(addrA, (UInt256)1)] = new SlotValue(val3);
+        content2.Storages[(addrA, (UInt256)2)] = new SlotValue(val2);
+        Snapshot snap2 = new(s1, s2, content2, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] data2 = PersistedSnapshotBuilderTestExtensions.Build(snap2, _blobs);
+
+        PersistedSnapshotList toMerge = new(2) { CreatePersistedSnapshot(s0, s1, data1), CreatePersistedSnapshot(s1, s2, data2) };
+        byte[] merged = PersistedSnapshotBuilderTestExtensions.NWayMergeSnapshots(toMerge);
+        PersistedSnapshot persisted = CreatePersistedSnapshot(s0, s2, merged);
+
+        SlotValue slot1 = default;
+        Assert.That(persisted.TryGetSlot(addrA, (UInt256)1, ref slot1), Is.True);
+        Assert.That(slot1.ToEvmBytes()[0], Is.EqualTo(0x03));
+
+        SlotValue slot2 = default;
+        Assert.That(persisted.TryGetSlot(addrA, (UInt256)2, ref slot2), Is.True);
+        Assert.That(slot2.ToEvmBytes()[0], Is.EqualTo(0x02));
+
+        SlotValue slot5 = default;
+        Assert.That(persisted.TryGetSlot(addrB, (UInt256)5, ref slot5), Is.True);
+        Assert.That(slot5.ToEvmBytes()[0], Is.EqualTo(0x02));
+    }
+
+    // Case source for Storage_NullSlot_Merge. Each case supplies, in order: a populator for the
+    // older snapshot's content, a populator for the newer snapshot's content, and a verifier
+    // that is run against the merged persisted snapshot.
+    private static IEnumerable<TestCaseData> NullSlotMergeCases()
+    {
+        byte[] nonZero = new byte[32];
+        nonZero[31] = 0xFF;
+
+        yield return new TestCaseData(
+            (Action<SnapshotContent>)(c => c.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(nonZero)),
+            (Action<SnapshotContent>)(c => c.Storages[(TestItem.AddressA, (UInt256)1)] = null),
+            (Action<PersistedSnapshot>)(persisted =>
+            {
+                SlotValue slot = default;
+                Assert.That(persisted.TryGetSlot(TestItem.AddressA, (UInt256)1, ref slot), Is.True);
+                Assert.That(slot.AsReadOnlySpan.IndexOfAnyExcept((byte)0), Is.EqualTo(-1), "Null slot should override value after merge");
+            })).SetName("NullOverridesValue");
+
+        yield return new TestCaseData(
+            (Action<SnapshotContent>)(c => c.Storages[(TestItem.AddressA, (UInt256)1)] = null),
+            (Action<SnapshotContent>)(c => c.Storages[(TestItem.AddressA, (UInt256)1)] = new SlotValue(nonZero)),
+            (Action<PersistedSnapshot>)(persisted =>
+            {
+                SlotValue slot = default;
+                Assert.That(persisted.TryGetSlot(TestItem.AddressA, (UInt256)1, ref slot), Is.True);
+                Assert.That(slot.ToEvmBytes().Length, Is.GreaterThan(0), "Value should override null slot after merge");
+            })).SetName("ValueOverridesNull");
+
+        yield return new TestCaseData(
+            (Action<SnapshotContent>)(c => c.Storages[(TestItem.AddressA, (UInt256)1)] = null),
+            (Action<SnapshotContent>)(c => c.Storages[(TestItem.AddressA, (UInt256)2)] = new SlotValue(nonZero)),
+            (Action<PersistedSnapshot>)(persisted =>
+            {
+                SlotValue slot1 = default;
+                Assert.That(persisted.TryGetSlot(TestItem.AddressA, (UInt256)1, ref slot1), Is.True);
+                Assert.That(slot1.AsReadOnlySpan.IndexOfAnyExcept((byte)0), Is.EqualTo(-1), "Null slot from older should be preserved");
+
+                SlotValue slot2 = default;
+                Assert.That(persisted.TryGetSlot(TestItem.AddressA, (UInt256)2, ref slot2), Is.True);
+                Assert.That(slot2.AsReadOnlySpan.IndexOfAnyExcept((byte)0), Is.GreaterThanOrEqualTo(0), "Value from newer should be present");
+            })).SetName("NullPreservedAndValueCarried");
+    }
+
+    // Builds an older (s0→s1) and a newer (s1→s2) snapshot from the supplied populators, merges
+    // them with NWayMergeSnapshots, and hands the merged s0→s2 snapshot to the verifier.
+    [TestCaseSource(nameof(NullSlotMergeCases))]
+    public void Storage_NullSlot_Merge(
+        Action<SnapshotContent> populateOlder,
+        Action<SnapshotContent> populateNewer,
+        Action<PersistedSnapshot> verify)
+    {
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s1 = new(1, Keccak.Compute("1"));
+        StateId s2 = new(2, Keccak.Compute("2"));
+
+        SnapshotContent olderContent = new();
+        populateOlder(olderContent);
+        Snapshot older = new(s0, s1, olderContent, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] dataOlder = PersistedSnapshotBuilderTestExtensions.Build(older, _blobs);
+
+        SnapshotContent newerContent = new();
+        populateNewer(newerContent);
+        Snapshot newer = new(s1, s2, newerContent, _resourcePool, ResourcePool.Usage.MainBlockProcessing);
+        byte[] dataNewer = PersistedSnapshotBuilderTestExtensions.Build(newer, _blobs);
+
+        PersistedSnapshotList toMerge = new(2) { CreatePersistedSnapshot(s0, s1, dataOlder), CreatePersistedSnapshot(s1, s2, dataNewer) };
+        byte[] merged = PersistedSnapshotBuilderTestExtensions.NWayMergeSnapshots(toMerge);
+        PersistedSnapshot persisted = CreatePersistedSnapshot(s0, s2, merged);
+
+        verify(persisted);
+    }
+
+    // Round-trips account / self-destruct / slot / storage-node across a range of slot counts,
+    // including a multi-page snapshot, then re-reads after AdviseDontNeed drops the kernel pages.
+ [TestCase(4)] + [TestCase(400)] + [TestCase(4000)] + public void RoundTrips_AcrossSlotCounts(int slotCount) + { + StateId from = new(0, Keccak.EmptyTreeHash); + StateId to = new(1, Keccak.Compute("warmup")); + + Address addr = TestItem.AddressA; + Hash256 addrHashKey = new(addr.ToAccountPath.Bytes); + Account expectedAccount = Build.An.Account.WithBalance(987654321).WithNonce(11).TestObject; + TreePath storagePath = new(Keccak.Compute("warmup-spath"), 6); + TrieNode storageNode = new(NodeType.Branch, [0xC3, 0x80, 0x81, 0x82]); + + SnapshotContent content = new(); + content.Accounts[addr] = expectedAccount; + content.SelfDestructedStorageAddresses[addr] = true; + content.StorageNodes[(addrHashKey, storagePath)] = storageNode; + for (int i = 0; i < slotCount; i++) + { + byte[] val = new byte[32]; + BinaryPrimitives.WriteInt32BigEndian(val.AsSpan(28, 4), i + 1); + content.Storages[(addr, (UInt256)i + 1)] = new SlotValue(val); + } + + Snapshot snapshot = new(from, to, content, _resourcePool, ResourcePool.Usage.MainBlockProcessing); + byte[] data = PersistedSnapshotBuilderTestExtensions.Build(snapshot, _blobs); + // The flat sorted table materialises a full record per slot, so a large slot count exceeds + // the shared 64 KiB fixture arena — use a roomier local arena for this case. + string arenaDir = Path.Combine(Path.GetTempPath(), $"nm-pstest-rt-{Guid.NewGuid():N}"); + using ArenaManager arena = TestFixtureHelpers.CreateArenaManager(arenaDir, 64 * 1024 * 1024); + using PersistedSnapshot persisted = TestFixtureHelpers.CreatePersistedSnapshot(arena, _blobs, from, to, data); + + // Per-address entries are keyed by raw Address; storage-trie reads take the addressHash. + ValueHash256 addrHash = addr.ToAccountPath; + + Assert.That(persisted.TryGetAccount(addr, out Account? 
acc1), Is.True); + Assert.That(acc1, Is.Not.Null); + Assert.That(acc1!.Balance, Is.EqualTo(expectedAccount.Balance)); + Assert.That(acc1.Nonce, Is.EqualTo(expectedAccount.Nonce)); + + Assert.That(persisted.TryGetSelfDestructFlag(addr), Is.EqualTo((bool?)true)); + + UInt256 probeIndex = (UInt256)(Math.Min(slotCount, 3)); + SlotValue slot1 = default; + Assert.That(persisted.TryGetSlot(addr, probeIndex, ref slot1), Is.True); + byte[] expectedSlotVal = new byte[32]; + BinaryPrimitives.WriteInt32BigEndian(expectedSlotVal.AsSpan(28, 4), (int)probeIndex); + Assert.That(slot1.AsReadOnlySpan.SequenceEqual(expectedSlotVal), Is.True); + + Assert.That(persisted.TryLoadStorageNodeRlp(addrHash, storagePath, out byte[]? nodeRlp1), Is.True); + Assert.That(nodeRlp1, Is.EqualTo(storageNode.FullRlp.ToArray())); + + // Second pass: results must match. + Assert.That(persisted.TryGetAccount(addr, out Account? acc2), Is.True); + Assert.That(acc2!.Balance, Is.EqualTo(expectedAccount.Balance)); + SlotValue slot2 = default; + Assert.That(persisted.TryGetSlot(addr, probeIndex, ref slot2), Is.True); + Assert.That(slot2.AsReadOnlySpan.SequenceEqual(expectedSlotVal), Is.True); + + // AdviseDontNeed advises the mmap range cold; the next reads re-fault any dropped page + // and the binary search must still resolve correctly. + persisted.AdviseDontNeed(); + Assert.That(persisted.TryGetAccount(addr, out Account? acc3), Is.True); + Assert.That(acc3!.Nonce, Is.EqualTo(expectedAccount.Nonce)); + SlotValue slot3 = default; + Assert.That(persisted.TryGetSlot(addr, probeIndex, ref slot3), Is.True); + Assert.That(slot3.AsReadOnlySpan.SequenceEqual(expectedSlotVal), Is.True); + + // Fresh miss for an unrelated address still works after AdviseDontNeed. 
+ Assert.That(persisted.TryGetAccount(TestItem.AddressB, out _), Is.False); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/Persistence/BloomFilter/BloomFilterTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/Persistence/BloomFilter/BloomFilterTests.cs index 4d1e4e1e29f2..862f386d9a6a 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/Persistence/BloomFilter/BloomFilterTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/Persistence/BloomFilter/BloomFilterTests.cs @@ -97,6 +97,18 @@ public void Dispose_MultipleTimes_ShouldNotThrow() Assert.DoesNotThrow(() => bloom.Dispose()); } + [TestCase(0UL)] + [TestCase(1UL)] + [TestCase(0xDEADBEEFCAFEBABEUL)] + [TestCase(ulong.MaxValue)] + public void AlwaysTrue_MightContain_AnyKey_ReturnsTrue(ulong key) + { + using Nethermind.State.Flat.Persistence.BloomFilter.BloomFilter bloom = + Nethermind.State.Flat.Persistence.BloomFilter.BloomFilter.AlwaysTrue(); + + Assert.That(bloom.MightContain(key), Is.True, "AlwaysTrue sentinel must match every probe"); + } + [Test] public void MightContain_BeforeAnyAdds_ShouldReturnFalse() { diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerPersistedTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerPersistedTests.cs new file mode 100644 index 000000000000..27746f09ead9 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerPersistedTests.cs @@ -0,0 +1,198 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using Nethermind.Core.Crypto; +using Nethermind.Core.Test.Builders; +using Nethermind.Db; +using Nethermind.State.Flat.PersistedSnapshots; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class PersistenceManagerPersistedTests +{ + private string _testDir = null!; + private ResourcePool _pool = null!; + + [SetUp] + public void SetUp() + { + _testDir = 
Path.Combine(Path.GetTempPath(), $"nethermind_test_{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + _pool = new ResourcePool(new FlatDbConfig()); + } + + [TearDown] + public void TearDown() + { + if (Directory.Exists(_testDir)) + Directory.Delete(_testDir, recursive: true); + } + + [Test] + public void ConvertToPersistedSnapshot_PersistsViaManager() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(500).TestObject; + Snapshot snap = new(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing); + + tier.ConvertToPersistedBase(snap).Dispose(); + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(1)); + Assert.That(repo.TryLeasePersistedState(s1, SnapshotTier.PersistedBase, out PersistedSnapshot? snapshot), Is.True); + snapshot!.Dispose(); + } + + [Test] + public void PrunePersistedSnapshots_RemovesOldSnapshots() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s3 = new(3, Keccak.Compute("3")); + StateId s6 = new(6, Keccak.Compute("6")); + + SnapshotContent c1 = new(); + c1.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1).TestObject; + tier.ConvertToPersistedBase(new Snapshot(s0, s1, c1, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + + SnapshotContent c2 = new(); + c2.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(2).TestObject; + tier.ConvertToPersistedBase(new Snapshot(s1, s3, c2, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + + SnapshotContent c3 = new(); + c3.Accounts[TestItem.AddressC] = Build.An.Account.WithBalance(3).TestObject; + tier.ConvertToPersistedBase(new 
Snapshot(s3, s6, c3, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(3)); + + // Snapshots with To.BlockNumber < 5 are removed (s1, s3); s6 survives. + repo.RemovePersistedStatesUntil(5); + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(1)); + } + + [Test] + public void RemoveSiblingAndDescendents_CrossTier_PrunesPersistedOrphans_KeepsCanonicalThroughPersistedAncestor() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + StateId c3 = new(3, Keccak.Compute("c3")); + StateId c4 = new(4, Keccak.Compute("c4")); + StateId nc3 = new(3, Keccak.Compute("nc3")); + StateId nc4 = new(4, Keccak.Compute("nc4")); + StateId c5 = new(5, Keccak.Compute("c5")); + + // Persisted tier: common chain s0->s1->s2, canonical s2->C3->C4, and a non-canonical + // fork s2->NC3->NC4 diverging at block 3. + PersistToTier(tier, s0, s1); + PersistToTier(tier, s1, s2); + PersistToTier(tier, s2, c3); + PersistToTier(tier, c3, c4); + PersistToTier(tier, s2, nc3); + PersistToTier(tier, nc3, nc4); + + // In-memory canonical C5 whose parent C4 lives only in the persisted tier — reachability + // to C3 therefore has to cross from the in-memory tier into the persisted tier. 
+ AddInMemory(repo, c4, c5); + + repo.RemoveSiblingAndDescendents(c3); + + Assert.That(LeasePresent(repo, nc4), Is.False, "orphan NC4 above the persisted block should be pruned from the persisted tier"); + Assert.That(LeasePresent(repo, c4), Is.True, "canonical C4 should be kept"); + Assert.That(repo.HasBaseSnapshot(c3), Is.True, "canonical target C3 should be kept"); + Assert.That(repo.HasBaseSnapshot(nc3), Is.True, "NC3 at the persisted block is left to RemoveStatesUntil"); + Assert.That(repo.HasState(c5), Is.True, "canonical in-memory C5 reachable through persisted C4 must be kept"); + } + + [Test] + public void RemoveSiblingAndDescendents_PersistedOrphanAboveInMemoryTip_IsPruned() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + StateId c3 = new(3, Keccak.Compute("c3")); + StateId nc3 = new(3, Keccak.Compute("nc3")); + StateId nc4 = new(4, Keccak.Compute("nc4")); + + // Persisted tier: common chain s0->s1->s2, canonical s2->C3, and a non-canonical fork + // s2->NC3->NC4 diverging at block 3 — NC4 is an orphan at block 4. + PersistToTier(tier, s0, s1); + PersistToTier(tier, s1, s2); + PersistToTier(tier, s2, c3); + PersistToTier(tier, s2, nc3); + PersistToTier(tier, nc3, nc4); + + // In-memory tip sits at the canonical block (3), BELOW the persisted orphan NC4 (block 4). + // The orphan walk's upper bound must come from the persisted tier, not the in-memory tip, + // or NC4 is never visited. 
+ AddInMemory(repo, s2, c3); + + repo.RemoveSiblingAndDescendents(c3); + + Assert.That(LeasePresent(repo, nc4), Is.False, "persisted orphan NC4 above the in-memory tip should be pruned"); + Assert.That(repo.HasBaseSnapshot(c3), Is.True, "canonical C3 should be kept"); + Assert.That(repo.HasBaseSnapshot(nc3), Is.True, "NC3 at the persisted block is left to RemoveStatesUntil"); + } + + [Test] + public void RemoveSiblingAndDescendents_PersistedLinearChain_RemovesNothing() + { + using FlatTestContainer tier = new(arenaFileSizeBytes: 4096); + SnapshotRepository repo = tier.Repository; + + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + StateId s2 = new(2, Keccak.Compute("2")); + StateId s3 = new(3, Keccak.Compute("3")); + PersistToTier(tier, s0, s1); + PersistToTier(tier, s1, s2); + PersistToTier(tier, s2, s3); + + int before = repo.PersistedSnapshotCount; + repo.RemoveSiblingAndDescendents(s1); + + Assert.That(repo.PersistedSnapshotCount, Is.EqualTo(before), "a linear persisted chain has no fork; nothing should be pruned"); + Assert.That(repo.HasBaseSnapshot(s2), Is.True); + Assert.That(repo.HasBaseSnapshot(s3), Is.True); + } + + private void PersistToTier(FlatTestContainer tier, StateId from, StateId to) + { + SnapshotContent content = new(); + content.Accounts[TestItem.AddressA] = Build.An.Account.WithBalance(1).TestObject; + tier.ConvertToPersistedBase(new Snapshot(from, to, content, _pool, ResourcePool.Usage.MainBlockProcessing)).Dispose(); + } + + private void AddInMemory(SnapshotRepository repo, StateId from, StateId to) + { + SnapshotContent content = new(); + content.Accounts[TestItem.AddressB] = Build.An.Account.WithBalance(1).TestObject; + repo.TryAdd(new Snapshot(from, to, content, _pool, ResourcePool.Usage.MainBlockProcessing), SnapshotTier.InMemoryBase); + repo.AddStateId(to); + } + + private static bool LeasePresent(SnapshotRepository repo, StateId to) + { + if (!repo.TryLeasePersistedState(to, 
SnapshotTier.PersistedBase, out PersistedSnapshot? snapshot)) return false; + snapshot!.Dispose(); + return true; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerTests.cs index 495a72a480c1..8aa835a30954 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/PersistenceManagerTests.cs @@ -2,6 +2,8 @@ // SPDX-License-Identifier: LGPL-3.0-only using System.Collections.Generic; +using System.Threading.Tasks; +using Nethermind.Config; using Nethermind.Core; using Nethermind.Core.Crypto; using Nethermind.Core.Test.Builders; @@ -9,6 +11,7 @@ using Nethermind.Int256; using Nethermind.Logging; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.Trie; using Nethermind.Trie.Pruning; using NSubstitute; @@ -22,8 +25,10 @@ public class PersistenceManagerTests private PersistenceManager _persistenceManager = null!; private FlatDbConfig _config = null!; private TestFinalizedStateProvider _finalizedStateProvider = null!; + private FlatTestContainer _tier = null!; private SnapshotRepository _snapshotRepository = null!; private IPersistence _persistence = null!; + private IPersistedSnapshotCompactor _persistedSnapshotCompactor = null!; private ResourcePool _resourcePool = null!; private StateId Block0 = new(0, Keccak.EmptyTreeHash); @@ -34,30 +39,45 @@ public void SetUp() { CompactSize = 16, MinReorgDepth = 64, - MaxReorgDepth = 256 + MaxInMemoryBaseSnapshotCount = 128 + 32, + MaxReorgDepth = 90000, + LongFinalityMaxReorgDepth = 90000, + EnableLongFinality = true }; _resourcePool = new ResourcePool(_config); _finalizedStateProvider = new TestFinalizedStateProvider(); - _snapshotRepository = new SnapshotRepository(LimboLogs.Instance); + // SnapshotRepository owns both tiers over a real temp-dir-backed persisted store, wired the + // production way through 
FlatWorldStateModule; the container pairs it with its loader (load on + // build, teardown on dispose). + _tier = new FlatTestContainer(); + _snapshotRepository = _tier.Repository; _persistence = Substitute.For(); IPersistence.IPersistenceReader persistenceReader = Substitute.For(); persistenceReader.CurrentState.Returns(Block0); _persistence.CreateReader().Returns(persistenceReader); + _persistedSnapshotCompactor = Substitute.For(); + _persistenceManager = new PersistenceManager( _config, ScheduleHelper.CreateWithOffset(_config, 0), _finalizedStateProvider, _persistence, _snapshotRepository, - LimboLogs.Instance); + LimboLogs.Instance, + _persistedSnapshotCompactor, + _tier.Loader, + Substitute.For()); } [TearDown] - public void TearDown() + public async Task TearDown() { + _persistenceManager.Dispose(); + await _persistedSnapshotCompactor.DisposeAsync(); + _tier.Dispose(); } private StateId CreateStateId(long blockNumber, byte rootByte = 0) @@ -74,11 +94,11 @@ private Snapshot CreateSnapshot(StateId from, StateId to, bool compacted = false if (compacted) { - _snapshotRepository.TryAddCompactedSnapshot(snapshot); + _snapshotRepository.TryAdd(snapshot, SnapshotTier.InMemoryCompacted); } else { - _snapshotRepository.TryAddSnapshot(snapshot); + _snapshotRepository.TryAdd(snapshot, SnapshotTier.InMemoryBase); } // AddStateId is needed for GetStatesAtBlockNumber to work @@ -87,6 +107,14 @@ private Snapshot CreateSnapshot(StateId from, StateId to, bool compacted = false return snapshot; } + // Persist a base directly into the (real) persisted tier, bypassing the in-memory tier. 
+ private void PersistBase(StateId from, StateId to) + { + Snapshot snapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.MainBlockProcessing); + snapshot.Content.Accounts[TestItem.AddressA] = new Account(1, 100); + _tier.ConvertToPersistedBase(snapshot).Dispose(); + } + private Snapshot CreateSnapshotWithSelfDestruct(StateId from, StateId to) { Snapshot snapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); @@ -94,104 +122,393 @@ private Snapshot CreateSnapshotWithSelfDestruct(StateId from, StateId to) return snapshot; } - #region Basic Behavior Tests - [Test] - public void DetermineSnapshotToPersist_InsufficientInMemoryDepth_ReturnsNull() + public void DetermineSnapshotAction_InsufficientInMemoryDepth_ReturnsNull() { - // Setup: persisted at Block0 (0), latest at 60, after persist would be < 64 minimum + // Gate passes (60+16=76 > 64) but GetFinalizedStateRootAt(16) is not configured → seed = null. StateId persisted = Block0; StateId latest = CreateStateId(60); _finalizedStateProvider.SetFinalizedBlockNumber(100); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? 
toConvert) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Null); + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Null); + Assert.That(toConvert, Is.Null); } - [TestCase(true, TestName = "DetermineSnapshotToPersist_SufficientDepthAndFinalized_ReturnsCompactedSnapshot")] - [TestCase(false, TestName = "DetermineSnapshotToPersist_SufficientDepthAndFinalized_FallsBackToUncompacted")] - public void DetermineSnapshotToPersist_SufficientDepthAndFinalized(bool useCompacted) + [TestCase(true, TestName = "DetermineSnapshotAction_SufficientDepthAndFinalized_ReturnsCompactedSnapshot")] + [TestCase(false, TestName = "DetermineSnapshotAction_SufficientDepthAndFinalized_BaseAtFinalizedBlock")] + public void DetermineSnapshotAction_SufficientDepthAndFinalized(bool useCompacted) { - // Setup: persisted at Block0, latest at 100, finalized at 100 + // Persisted at Block0, latest at 100, finalized at the target block (= the single seed). + // With CompactSize=16, finalized must be >= persisted + 16 for the normal-trigger seed to + // engage; the non-compacted case uses a base at block 16 to satisfy that gate. StateId persisted = Block0; StateId latest = CreateStateId(100); - // Vary target block and compaction based on parameter - int targetBlock = useCompacted ? 16 : 1; // compacted uses 16, fallback uses 1 - StateId target = CreateStateId(targetBlock); - - _finalizedStateProvider.SetFinalizedBlockNumber(100); - _finalizedStateProvider.SetFinalizedStateRootAt(targetBlock, new Hash256(target.StateRoot.Bytes)); + StateId target = CreateStateId(16); + _finalizedStateProvider.SetFinalizedBlockNumber(16); + _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target.StateRoot.Bytes)); - // Create snapshot (compacted or not based on parameter) using Snapshot expectedSnapshot = CreateSnapshot(persisted, target, compacted: useCompacted); - Snapshot? 
result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? toConvert) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Not.Null); - Assert.That(result!.From, Is.EqualTo(persisted)); - Assert.That(result.To, Is.EqualTo(target)); + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Not.Null); + Assert.That(toConvert, Is.Null); + Assert.That(toPersist!.From, Is.EqualTo(persisted)); + Assert.That(toPersist.To, Is.EqualTo(target)); - result.Dispose(); + toPersist.Dispose(); } - #endregion - - #region Unfinalized State Tests - [Test] - public void DetermineSnapshotToPersist_UnfinalizedButBelowForceLimit_ReturnsNull() + public void DetermineSnapshotAction_UnfinalizedButBelowForceLimit_ReturnsNull() { - // Setup: persisted at Block0, latest at 150, finalized at 10 (way behind) - // After persist would be at 16, which is > finalized - // But in-memory depth is 150 (< 256 forced boundary) + // Depth (150) is below LongFinalityMaxReorgDepth (90000), so the backstop doesn't fire. + // Finalized (10) < nextBoundary (16), so the normal-trigger gate also doesn't fire. + // Neither Phase 1 path activates; Phase 2 is below the SnapshotCount threshold. StateId persisted = Block0; StateId latest = CreateStateId(150); _finalizedStateProvider.SetFinalizedBlockNumber(10); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? 
toConvert) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Null); + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Null); + Assert.That(toConvert, Is.Null); } - [TestCase(true, TestName = "DetermineSnapshotToPersist_UnfinalizedAndAboveForceLimit_ForcePersistsCompacted")] - [TestCase(false, TestName = "DetermineSnapshotToPersist_UnfinalizedAndAboveForceLimit_FallsBackToUncompacted")] - public void DetermineSnapshotToPersist_UnfinalizedAndAboveForceLimit(bool useCompacted) + [Test] + public void DetermineSnapshotAction_LongFinalityDisabled_SkipsConversionPath() { - // Setup: persisted at Block0, latest at 300, finalized at 10 - // In-memory depth is ~301 (> 256 forced boundary) + // In-memory depth ~301, finality stalled at block 10. With EnableLongFinality off, the + // conversion path must not fire and we must not invoke the converter. + _config.EnableLongFinality = false; + _persistenceManager = new PersistenceManager( + _config, + ScheduleHelper.CreateWithOffset(_config, 0), + _finalizedStateProvider, + _persistence, + _snapshotRepository, + LimboLogs.Instance, + _persistedSnapshotCompactor, + _tier.Loader, + Substitute.For()); + StateId persisted = Block0; StateId latest = CreateStateId(300); + StateId target = CreateStateId(1); + _finalizedStateProvider.SetFinalizedBlockNumber(10); + + using Snapshot expectedSnapshot = CreateSnapshot(persisted, target, compacted: false); + + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? toConvert) = _persistenceManager.DetermineSnapshotAction(latest); - // Vary target block and compaction based on parameter - int targetBlock = useCompacted ? 16 : 1; // compacted uses 16, fallback uses 1 - StateId target = CreateStateId(targetBlock); + // The load-bearing check: the long-finality conversion path is short-circuited. 
+ // toPersist may still be populated by the normal finalized-snapshot-to-RocksDB + // fall-through (its behaviour is unchanged), but no persisted-snapshot conversion + // and no force-persisted-snapshot was returned. + Assert.That(persistedToPersist, Is.Null); + Assert.That(toConvert, Is.Null, "Conversion path must be gated when EnableLongFinality is false"); + // Sanity: with the flag off no snapshot was converted into the persisted tier. + toPersist?.Dispose(); + Assert.That(_snapshotRepository.PersistedSnapshotCount, Is.EqualTo(0)); + } + + [Test] + public void DetermineSnapshotAction_BackstopExceeded_SeedsFromInMemoryTier() + { + // Backstop: snapshotsDepth (95000) > LongFinalityMaxReorgDepth (90000), finalized not in range. + // Phase 1 must seed from the in-memory tier's latest registered state. + StateId latest = CreateStateId(95000); + // tierTip spans at most CompactSize from Block0 so the base it anchors is a persist candidate. + StateId tierTip = CreateStateId(_config.CompactSize); _finalizedStateProvider.SetFinalizedBlockNumber(10); - // Create snapshot (compacted or not based on parameter) - using Snapshot expectedSnapshot = CreateSnapshot(persisted, target, compacted: useCompacted); + // CreateSnapshot registers the snapshot's To, so GetLastSnapshotId returns tierTip and the backstop + // seeds on it; emulate a one-hop graph by registering a base at tierTip with From = Block0. + using Snapshot expected = CreateSnapshot(Block0, tierTip, compacted: false); + + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? toConvert) = _persistenceManager.DetermineSnapshotAction(latest); + + Assert.That(toConvert, Is.Null); + // The backstop seed lands on tierTip; the BFS finds the in-memory base whose From == Block0 + // (currentPersistedState) and returns it as toPersist. 
+ Assert.That(toPersist, Is.Not.Null); + Assert.That(toPersist!.From, Is.EqualTo(Block0)); + Assert.That(toPersist.To, Is.EqualTo(tierTip)); + + toPersist.Dispose(); + } + + // With MinReorgDepth >= the configured backstop, the effective backstop is raised to + // MinReorgDepth + CompactSize, so a depth just past the configured 90000 does NOT force-persist, + // but one past MinReorgDepth + CompactSize does. + [TestCase(90001, false, TestName = "DetermineSnapshotAction_BackstopRaised_BelowMinPlusCompactSize_NoForce")] + [TestCase(90000 + 16 + 1, true, TestName = "DetermineSnapshotAction_BackstopRaised_AboveMinPlusCompactSize_Forces")] + public void DetermineSnapshotAction_BackstopRaisedAboveMinReorgDepth(long latestBlock, bool expectForcedPersist) + { + // MinReorgDepth == configured backstop == 90000, CompactSize 16 → effective backstop 90016. + FlatDbConfig config = new() + { + CompactSize = 16, + MinReorgDepth = 90000, + MaxReorgDepth = 90000, + LongFinalityMaxReorgDepth = 90000, + EnableLongFinality = true, + MaxInMemoryBaseSnapshotCount = 160, + }; + using PersistenceManager pm = new( + config, + ScheduleHelper.CreateWithOffset(config, 0), + _finalizedStateProvider, + _persistence, + _snapshotRepository, + LimboLogs.Instance, + _persistedSnapshotCompactor, + _tier.Loader, + Substitute.For()); + + // Finalized below the next boundary so only the backstop (not the finalized trigger) can fire; + // a registered base at tierTip gives FindSnapshotToPersist a candidate. + StateId tierTip = CreateStateId(config.CompactSize); + using Snapshot expected = CreateSnapshot(Block0, tierTip, compacted: false); + _finalizedStateProvider.SetFinalizedBlockNumber(5); + + (_, Snapshot? 
toPersist, _) = pm.DetermineSnapshotAction(CreateStateId(latestBlock)); + + Assert.That(toPersist is not null, Is.EqualTo(expectForcedPersist)); + toPersist?.Dispose(); + } + + [Test] + public void DetermineSnapshotAction_FinalizedGatePassesButSeedMissing_BackstopStillForcesPersist() + { + // Regression: with MinReorgDepth == the configured backstop (both 90000), the finalized + // trigger's depth gate (depth + CompactSize > MinReorgDepth) is satisfied across the whole + // operating range above the backstop. When the finalized branch is entered but yields no seed + // (its synthetic boundary root resolves to null here), the backstop must STILL fire — it is an + // independent fallback, not an `else if` shadowed by the always-satisfied finalized depth gate. + // Before the fix this returned no persist candidate, so deep state never persisted. + FlatDbConfig config = new() + { + CompactSize = 16, + MinReorgDepth = 90000, + MaxReorgDepth = 90000, + LongFinalityMaxReorgDepth = 90000, + EnableLongFinality = true, + MaxInMemoryBaseSnapshotCount = 160, + }; + using PersistenceManager pm = new( + config, + ScheduleHelper.CreateWithOffset(config, 0), + _finalizedStateProvider, + _persistence, + _snapshotRepository, + LimboLogs.Instance, + _persistedSnapshotCompactor, + _tier.Loader, + Substitute.For()); + + // Finalized at/above the next boundary so the finalized branch IS entered, but leave + // GetFinalizedStateRootAt(16) unset so its seed resolves to null. Depth (90017) exceeds the + // effective backstop (MinReorgDepth + CompactSize = 90016), so the backstop must persist. + StateId tierTip = CreateStateId(config.CompactSize); + using Snapshot expected = CreateSnapshot(Block0, tierTip, compacted: false); + _finalizedStateProvider.SetFinalizedBlockNumber(90000); + + (_, Snapshot? toPersist, PersistenceManager.ConversionCandidate? 
toConvert) = pm.DetermineSnapshotAction(CreateStateId(90017)); + + Assert.That(toPersist, Is.Not.Null, "Backstop must force a persist even when the finalized branch ran but found no seed"); + Assert.That(toPersist!.From, Is.EqualTo(Block0)); + Assert.That(toPersist.To, Is.EqualTo(tierTip)); + Assert.That(toConvert, Is.Null); + toPersist.Dispose(); + } + + [Test] + public void DetermineSnapshotAction_FinalizedBeyondHead_SeedsAtBoundary() + { + // Catch-up sync: CL reports a finalized block far beyond the local chain head. + // GetFinalizedStateRootAt(finalizedBlockNumber) would return null, but the boundary + // block (persisted + CompactSize) IS locally synced, so the canonical-root lookup + // resolves there. Phase 1 must seed at the boundary and persist the boundary snapshot. + StateId persisted = Block0; + StateId latest = CreateStateId(200); + StateId boundary = CreateStateId(_config.CompactSize); + + _finalizedStateProvider.SetFinalizedBlockNumber(25_128_361); + // Deliberately leave GetFinalizedStateRootAt(25_128_361) unset → returns null; + // only the boundary block has a known canonical state root. + _finalizedStateProvider.SetFinalizedStateRootAt(_config.CompactSize, new Hash256(boundary.StateRoot.Bytes)); + + using Snapshot expected = CreateSnapshot(persisted, boundary, compacted: false); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? 
toConvert) = _persistenceManager.DetermineSnapshotAction(latest); + + Assert.That(toConvert, Is.Null); + Assert.That(toPersist, Is.Not.Null); + Assert.That(toPersist!.From, Is.EqualTo(persisted)); + Assert.That(toPersist.To, Is.EqualTo(boundary)); + + toPersist.Dispose(); + } + + [Test] + public void TryFindSnapshotToConvert_PrefersBoundaryCompactedOverBase() + { + // Phase 2 must globally prefer a CompactSize-wide compacted (→ large repo via Branch A) + // over any in-memory base (→ small repo via Branch B), regardless of block-number + // ordering. Seed an in-memory base at state(1) and a CompactSize-wide (16-wide) compacted + // at state(16) — both have From == Block0 on disk — and assert the compacted is picked. + StateId persisted = Block0; + StateId baseTo = CreateStateId(1); + StateId compactedTo = CreateStateId(16); + + // Base at state(1) — sub-CompactSize; Branch B candidate. + using Snapshot baseSnap = CreateSnapshot(persisted, baseTo, compacted: false); + // 16-wide compacted from Block0 — boundary, should win under the two-pass form. + using Snapshot compactedSnap = CreateSnapshot(persisted, compactedTo, compacted: true); + + PersistenceManager.ConversionCandidate? result = InvokeTryFindSnapshotToConvert(persisted); Assert.That(result, Is.Not.Null); - Assert.That(result!.From, Is.EqualTo(persisted)); - Assert.That(result.To, Is.EqualTo(target)); + Assert.That(result!.Compacted, Is.Not.Null); + Assert.That(result.Compacted!.From, Is.EqualTo(persisted)); + Assert.That(result.Compacted.To, Is.EqualTo(compactedTo)); + Assert.That(result.Base, Is.Null); + + result.Compacted.Dispose(); + } + + [Test] + public void ConvertCompactedRange_BoundaryCompacted_RemovesOnlyConvertedStates_PreservingOutsider() + { + // Branch A converts the in-memory bases spanning the boundary compacted's range, then must + // remove ONLY those gathered states from the in-memory tier. 
A state outside the gathered + // range (here one below `start`, standing in for a snapshot added concurrently mid-convert) + // must survive — the old bulk RemoveStatesUntil(end) would have wrongly swept it. + StateId compactedFrom = CreateStateId(2); + StateId compactedTo = CreateStateId(2 + _config.CompactSize); // span == CompactSize → Branch A + StateId baseA = CreateStateId(5); + StateId baseB = CreateStateId(10); + StateId outsider = CreateStateId(1); // below start (= compactedFrom.BlockNumber + 1) + + // ConvertCompactedRange persists the gathered snapshot into the real persisted tier. + // The converted/boundary snapshots are disposed by it (via RemoveAndRelease + the + // pre-leased candidate), so they are NOT wrapped in `using`. Only the survivor is. + CreateSnapshot(compactedFrom, compactedTo, compacted: true); + CreateSnapshot(compactedFrom, baseA, compacted: false); + CreateSnapshot(baseA, baseB, compacted: false); + using Snapshot outsiderSnap = CreateSnapshot(Block0, outsider, compacted: false); + + Assert.That(_snapshotRepository.HasState(outsider), Is.True); + + _snapshotRepository.TryLeaseInMemoryState(compactedTo, SnapshotTier.InMemoryCompacted, out Snapshot? compactedForConvert); + InvokeConvertCompactedRange(compactedForConvert!); + + Assert.Multiple(() => + { + Assert.That(_snapshotRepository.HasState(outsider), Is.True, "state below `start` must survive"); + // Gathered states are converted into the persisted tier (so HasState still sees them) but + // must be dropped from the in-memory tier — check in-memory presence via TryLeaseInMemoryState. 
+ Assert.That(_snapshotRepository.TryLeaseInMemoryState(baseA, SnapshotTier.InMemoryBase, out _), Is.False, "baseA removed from the in-memory tier"); + Assert.That(_snapshotRepository.TryLeaseInMemoryState(baseB, SnapshotTier.InMemoryBase, out _), Is.False, "baseB removed from the in-memory tier"); + Assert.That(_snapshotRepository.TryLeaseInMemoryState(compactedTo, SnapshotTier.InMemoryCompacted, out _), Is.False, "boundary compacted removed"); + }); + } + + [Test] + public async Task AddToPersistence_InMemoryPersist_PrunesPersistedTier() + { + // Persisting an in-memory snapshot must trigger RemoveStatesUntil on both tier repos so + // superseded tier entries get cleared — the toPersist branch must prune, not only the + // persistedToPersist branch. + StateId from = Block0; + StateId to = CreateStateId(16); + StateId latest = CreateStateId(100); + + // AddToPersistence persists then prunes this in-memory snapshot, so the repo owns its disposal. + _ = CreateSnapshot(from, to, compacted: true); + + // A persisted entry below the new persisted block must be pruned by the persist. + StateId stale = CreateStateId(8); + PersistBase(Block0, stale); + Assert.That(_snapshotRepository.HasBaseSnapshot(stale), Is.True); + + _finalizedStateProvider.SetFinalizedBlockNumber(16); + _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(to.StateRoot.Bytes)); + + IPersistence.IWriteBatch writeBatch = Substitute.For(); + _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); + + await _persistenceManager.AddToPersistence(latest); + + // Persisting the in-memory snapshot at `to` must prune the persisted tier below `to`. + Assert.That(_snapshotRepository.HasBaseSnapshot(stale), Is.False); + } + + [Test] + public async Task AddToPersistence_TierSourcePersist_PrunesPersistedTier() + { + // Sibling of AddToPersistence_InMemoryPersist_PrunesPersistedTier for the persistedToPersist + // branch. 
Tier-source persists must also drive RemoveStatesUntil so superseded entries are cleared. + StateId target = CreateStateId(16); + StateId latest = CreateStateId(100); + _finalizedStateProvider.SetFinalizedBlockNumber(16); + _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target.StateRoot.Bytes)); + + // No in-memory snapshot — DetermineSnapshotAction takes the tier-fallback path and persists + // the base in the persisted tier whose From == the current persisted state (Block0). + PersistBase(Block0, target); + // A persisted entry below `target` must be pruned by the persist. + StateId stale = CreateStateId(8); + PersistBase(Block0, stale); + + IPersistence.IWriteBatch writeBatch = Substitute.For(); + _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); + + await _persistenceManager.AddToPersistence(latest); - result.Dispose(); + Assert.That(_snapshotRepository.HasBaseSnapshot(stale), Is.False); } [Test] - public void DetermineSnapshotToPersist_UnfinalizedForkAtBoundary_PersistsHeadReachableFork() + public void DetermineSnapshotAction_UnfinalizedBelowBackstop_ReturnsNull() { - // Two unfinalized forks at the boundary block 16, both starting from Block0. The head's chain runs - // through target2 (the higher root, not the arbitrary "first"). The forced persist must follow the - // head's chain (target2), otherwise persisting target1 would orphan the head. + // Unfinalized (finalized at 10, persisted at 0 — not in range for the CompactSize=16 + // gate) AND in-memory depth (300) below LongFinalityMaxReorgDepth (90000): no force-persist, + // no Phase 1 candidate. Phase 2 entry guard (SnapshotCount > 160) also not satisfied with + // a single created snapshot. Action: do nothing. 
StateId persisted = Block0; - StateId target1 = CreateStateId(16, rootByte: 1); // arbitrary "first" (lowest root) - StateId target2 = CreateStateId(16, rootByte: 2); // on the head's chain - StateId head = CreateStateId(300); + StateId latest = CreateStateId(300); + StateId target = CreateStateId(1); + + _finalizedStateProvider.SetFinalizedBlockNumber(10); + + using Snapshot expectedSnapshot = CreateSnapshot(persisted, target, compacted: false); + + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate? toConvert) = _persistenceManager.DetermineSnapshotAction(latest); + + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Null); + Assert.That(toConvert, Is.Null); + } + + [Test] + public void DetermineSnapshotAction_UnfinalizedForkAtBoundary_PersistsHeadReachableFork() + { + // Two unfinalized forks at the boundary block 16, both starting from Block0. The committed head's + // chain runs through target2, not the arbitrary target1. The backstop force-persist must follow the + // committed head's chain (target2) — persisting target1 would orphan the head. + StateId persisted = Block0; + StateId target1 = CreateStateId(16, rootByte: 1); // off-chain fork + StateId target2 = CreateStateId(16, rootByte: 2); // on the committed head's chain + StateId head = CreateStateId(95000); // depth > LongFinalityMaxReorgDepth (90000) → backstop fires _finalizedStateProvider.SetFinalizedBlockNumber(10); // unfinalized at the boundary @@ -200,68 +517,86 @@ public void DetermineSnapshotToPersist_UnfinalizedForkAtBoundary_PersistsHeadRea using Snapshot toHead = CreateSnapshot(target2, head, compacted: true); // head reachable only via target2 _snapshotRepository.SetLastCommittedStateId(head); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(head); + (_, Snapshot? 
toPersist, _) = _persistenceManager.DetermineSnapshotAction(head); - Assert.That(result, Is.Not.Null); - Assert.That(result!.From, Is.EqualTo(persisted)); - Assert.That(result.To, Is.EqualTo(target2)); + Assert.That(toPersist, Is.Not.Null); + Assert.That(toPersist!.From, Is.EqualTo(persisted)); + Assert.That(toPersist.To, Is.EqualTo(target2)); - result.Dispose(); + toPersist.Dispose(); } [Test] - public void DetermineSnapshotToPersist_LongerNonCanonicalFork_PersistsCommittedHeadChain() + public void DetermineSnapshotAction_LongerNonCanonicalFork_PersistsCommittedHeadChain() { - // The longest in-memory chain runs through target1 up to block 300, but the committed head is the - // shorter chain through target2 (at block 32). The forced persist must follow the committed head - // (target2), not the longer fork (target1) that GetLastSnapshotId would pick. + // The longest in-memory chain runs through target1 (longHead is the max, so GetLastSnapshotId would + // pick it), but the committed head is the shorter chain through target2. The backstop must follow the + // committed head (target2), not the longer fork (target1) that the GetLastSnapshotId fallback would pick. 
StateId persisted = Block0; StateId target1 = CreateStateId(16, rootByte: 1); // boundary state on the longer, non-canonical fork StateId target2 = CreateStateId(16, rootByte: 2); // boundary state on the committed head's chain - StateId longHead = CreateStateId(300); // longest chain (the max), but not committed - StateId committedHead = CreateStateId(32, rootByte: 2); + StateId longHead = CreateStateId(95001, rootByte: 1); // longest chain, but not committed + StateId committedHead = CreateStateId(95000, rootByte: 2); _finalizedStateProvider.SetFinalizedBlockNumber(0); // unfinalized at the boundary - using Snapshot fork1 = CreateSnapshot(persisted, target1, compacted: true); + // longHead (block 95001) is the max, so the GetLastSnapshotId fallback would pick the longer fork — + // only honouring the committed head selects target2. using Snapshot fork2 = CreateSnapshot(persisted, target2, compacted: true); - using Snapshot toLongHead = CreateSnapshot(target1, longHead, compacted: true); // makes target1 the max chain using Snapshot toCommittedHead = CreateSnapshot(target2, committedHead, compacted: true); + using Snapshot fork1 = CreateSnapshot(persisted, target1, compacted: true); + using Snapshot toLongHead = CreateSnapshot(target1, longHead, compacted: true); _snapshotRepository.SetLastCommittedStateId(committedHead); - // latestSnapshot at 300 (the longest chain) makes the in-memory depth exceed MaxReorgDepth (256), - // triggering the force-persist branch. - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(longHead); + // latestSnapshot at the longest chain makes the in-memory depth exceed LongFinalityMaxReorgDepth, triggering the + // force-persist (backstop) branch. + (_, Snapshot? 
toPersist, _) = _persistenceManager.DetermineSnapshotAction(longHead); - Assert.That(result, Is.Not.Null); - Assert.That(result!.From, Is.EqualTo(persisted)); - Assert.That(result.To, Is.EqualTo(target2)); + Assert.That(toPersist, Is.Not.Null); + Assert.That(toPersist!.From, Is.EqualTo(persisted)); + Assert.That(toPersist.To, Is.EqualTo(target2)); - result.Dispose(); + toPersist.Dispose(); } - #endregion - - #region Edge Cases - [Test] - public void DetermineSnapshotToPersist_NoSnapshotAvailable_ReturnsNull() + public void DetermineSnapshotAction_NoSnapshotAvailable_ReturnsNull() { - // Setup: sufficient depth but no snapshots in repository StateId persisted = Block0; StateId latest = CreateStateId(100); _finalizedStateProvider.SetFinalizedBlockNumber(100); _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(CreateStateId(16).StateRoot.Bytes)); - // Don't create any snapshots + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, _) = _persistenceManager.DetermineSnapshotAction(latest); + + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Null); + } + + [Test] + public void DetermineSnapshotAction_FinalizedNoInMemory_FallsBackToPersistedSnapshot() + { + // Setup: persisted at Block0, latest at 100, finalized at 16 — the BFS seeds with the + // finalized state, which corresponds exactly to the base we persist into the tier below. + StateId latest = CreateStateId(100); + StateId target = CreateStateId(16); + _finalizedStateProvider.SetFinalizedBlockNumber(16); + _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target.StateRoot.Bytes)); + + // Don't create any in-memory snapshots — persist a base into the tier so the fallback finds it. + PersistBase(Block0, target); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, PersistenceManager.ConversionCandidate?
toConvert) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Null); + Assert.That(persistedToPersist, Is.Not.Null); + Assert.That(toPersist, Is.Null); + Assert.That(toConvert, Is.Null); + + persistedToPersist!.Dispose(); } [Test] - public void DetermineSnapshotToPersist_SnapshotWithWrongFromState_ReturnsNull() + public void DetermineSnapshotAction_SnapshotWithWrongFromState_ReturnsNull() { // Setup: snapshot exists but doesn't start from current persisted state StateId persisted = Block0; @@ -271,73 +606,75 @@ public void DetermineSnapshotToPersist_SnapshotWithWrongFromState_ReturnsNull() _finalizedStateProvider.SetFinalizedBlockNumber(100); _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target.StateRoot.Bytes)); - // Create snapshot with wrong "from" state using Snapshot wrongSnapshot = CreateSnapshot(wrongFrom, target, compacted: true); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, _) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Null); + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Null); } [Test] - public void DetermineSnapshotToPersist_MultipleStatesAtBlock_SelectsCorrectOne() + public void DetermineSnapshotAction_MultipleStatesAtBlock_SelectsCorrectOne() { - // Setup: multiple state roots at same block number (reorg scenario) + // Setup: multiple state roots at same block number (reorg scenario). Set finalized at the + // candidate block so the single-seed BFS lands directly on the finalized state root. 
StateId persisted = Block0; StateId latest = CreateStateId(100); StateId target1 = CreateStateId(16, rootByte: 1); - StateId target2 = CreateStateId(16, rootByte: 2); // Different root - _finalizedStateProvider.SetFinalizedBlockNumber(100); - _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target2.StateRoot.Bytes)); // target2 is finalized + StateId target2 = CreateStateId(16, rootByte: 2); + _finalizedStateProvider.SetFinalizedBlockNumber(16); + _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target2.StateRoot.Bytes)); - // Create both snapshots using Snapshot snapshot1 = CreateSnapshot(persisted, target1, compacted: true); using Snapshot snapshot2 = CreateSnapshot(persisted, target2, compacted: true); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, _) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Not.Null); - Assert.That(result!.To.StateRoot.Bytes.ToArray(), Is.EqualTo(target2.StateRoot.Bytes.ToArray())); // Should select finalized one + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Not.Null); + Assert.That(toPersist!.To.StateRoot.Bytes.ToArray(), Is.EqualTo(target2.StateRoot.Bytes.ToArray())); - result.Dispose(); + toPersist.Dispose(); } [Test] - public void DetermineSnapshotToPersist_ExactlyAtMinimumBoundary_ReturnsNull() + public void DetermineSnapshotAction_ExactlyAtMinimumBoundary_ReturnsNull() { - // Setup: persisted at Block0 (0), latest at 79 - // After persist would be at 15, leaving depth of 64 (exactly at minimum boundary) + // Gate passes (79+16=95 > 64), but GetFinalizedStateRootAt(16) is not configured → + // returns null → seed = null. No backstop (79 << LongFinalityMaxReorgDepth). Result: null. StateId persisted = Block0; StateId latest = CreateStateId(79); _finalizedStateProvider.SetFinalizedBlockNumber(100); - Snapshot? 
result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, _) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Null); + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Null); } [Test] - public void DetermineSnapshotToPersist_OneAboveMinimumBoundary_ReturnsSnapshot() + public void DetermineSnapshotAction_OneAboveMinimumBoundary_ReturnsSnapshot() { - // Setup: persisted at Block0 (0), latest at 80 - // After persist would be at 15, leaving depth of 65 (one above minimum boundary) + // Setup: persisted at Block0, latest at 80, finalized at the candidate block (16) so the + // single-seed BFS lands directly on it. Depth (80) + CompactSize (16) = 96 > MinReorgDepth + // (64) — passes the normal-trigger gate. StateId persisted = Block0; StateId latest = CreateStateId(80); StateId target = CreateStateId(16); - _finalizedStateProvider.SetFinalizedBlockNumber(100); + _finalizedStateProvider.SetFinalizedBlockNumber(16); _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(target.StateRoot.Bytes)); using Snapshot expectedSnapshot = CreateSnapshot(persisted, target, compacted: true); - Snapshot? result = _persistenceManager.DetermineSnapshotToPersist(latest); + (PersistedSnapshot? persistedToPersist, Snapshot? 
toPersist, _) = _persistenceManager.DetermineSnapshotAction(latest); - Assert.That(result, Is.Not.Null); + Assert.That(persistedToPersist, Is.Null); + Assert.That(toPersist, Is.Not.Null); - result!.Dispose(); + toPersist!.Dispose(); } - #endregion - #region PersistSnapshot Tests [Test] @@ -414,71 +751,29 @@ public void PersistSnapshot_EmptySnapshot_CreatesWriteBatch() #endregion - #region AddToPersistence Tests - [Test] - public void AddToPersistence_WithAvailableSnapshot_PersistsAndUpdatesState() + public async Task AddToPersistence_WithAvailableSnapshot_PersistsAndUpdatesState() { - // Arrange + // Finalized at the candidate block so the single-seed BFS lands directly on it. StateId from = Block0; StateId to = CreateStateId(16); StateId latest = CreateStateId(100); - // Create a snapshot that should be persisted - using Snapshot snapshot = CreateSnapshot(from, to, compacted: true); + // AddToPersistence persists then prunes this in-memory snapshot, so the repo owns its disposal. + _ = CreateSnapshot(from, to, compacted: true); - _finalizedStateProvider.SetFinalizedBlockNumber(100); + _finalizedStateProvider.SetFinalizedBlockNumber(16); _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(to.StateRoot.Bytes)); IPersistence.IWriteBatch writeBatch = Substitute.For(); _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); - // Act - _persistenceManager.AddToPersistence(latest); + await _persistenceManager.AddToPersistence(latest); - // Assert - // Verify write batch was created (persistence happened) _persistence.Received().CreateWriteBatch(from, to); - - // Verify current persisted state was updated Assert.That(_persistenceManager.GetCurrentPersistedStateId(), Is.EqualTo(to)); } - #endregion - - #region Offset Behavior - - [TestCase(3, 13)] - [TestCase(5, 11)] - [TestCase(0, 16)] - public void DetermineSnapshotToPersist_WithOffset_FirstBoundaryShifted(int offset, int expectedTargetBlock) - { - // Fresh DB: currentPersistedState = Block0 
(block 0). - // With CompactSize=16 and offset=N, the next full compaction boundary is at block 16-N. - PersistenceManager pm = new( - _config, - ScheduleHelper.CreateWithOffset(_config, offset), - _finalizedStateProvider, - _persistence, - _snapshotRepository, - LimboLogs.Instance); - - StateId target = CreateStateId(expectedTargetBlock); - StateId latest = CreateStateId(200); - _finalizedStateProvider.SetFinalizedBlockNumber(200); - _finalizedStateProvider.SetFinalizedStateRootAt(expectedTargetBlock, new Hash256(target.StateRoot.Bytes)); - - using Snapshot expected = CreateSnapshot(Block0, target, compacted: true); - - Snapshot? result = pm.DetermineSnapshotToPersist(latest); - - Assert.That(result, Is.Not.Null); - Assert.That(result!.To, Is.EqualTo(target)); - result.Dispose(); - } - - #endregion - #region FlushToPersistence Tests [Test] @@ -497,7 +792,6 @@ public void FlushToPersistence_NoSnapshots_ReturnsCurrentPersistedState() [Test] public void FlushToPersistence_WithFinalizedSnapshots_PersistsFinalizedFirst() { - // Arrange StateId state16 = CreateStateId(16); StateId state32 = CreateStateId(32); @@ -505,16 +799,15 @@ public void FlushToPersistence_WithFinalizedSnapshots_PersistsFinalizedFirst() _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(state16.StateRoot.Bytes)); _finalizedStateProvider.SetFinalizedStateRootAt(32, new Hash256(state32.StateRoot.Bytes)); - using Snapshot snapshot1 = CreateSnapshot(Block0, state16, compacted: true); - using Snapshot snapshot2 = CreateSnapshot(state16, state32, compacted: true); + // Repo-owned; FlushToPersistence prunes (disposes) them once persisted, so don't double-own. 
+ CreateSnapshot(Block0, state16, compacted: true); + CreateSnapshot(state16, state32, compacted: true); IPersistence.IWriteBatch writeBatch = Substitute.For(); _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); - // Act StateId result = _persistenceManager.FlushToPersistence(); - // Assert Assert.That(result, Is.EqualTo(state32)); _persistence.Received().CreateWriteBatch(Block0, state16); _persistence.Received().CreateWriteBatch(state16, state32); @@ -523,19 +816,17 @@ public void FlushToPersistence_WithFinalizedSnapshots_PersistsFinalizedFirst() [Test] public void FlushToPersistence_WithUnfinalizedSnapshots_FallsBackToFirstAvailable() { - // Arrange - no finalization info available StateId state16 = CreateStateId(16); - _finalizedStateProvider.SetFinalizedBlockNumber(0); // Nothing finalized + _finalizedStateProvider.SetFinalizedBlockNumber(0); - using Snapshot snapshot = CreateSnapshot(Block0, state16, compacted: true); + // Repo-owned; FlushToPersistence prunes (disposes) it once persisted, so don't double-own. + CreateSnapshot(Block0, state16, compacted: true); IPersistence.IWriteBatch writeBatch = Substitute.For(); _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); - // Act StateId result = _persistenceManager.FlushToPersistence(); - // Assert Assert.That(result, Is.EqualTo(state16)); _persistence.Received().CreateWriteBatch(Block0, state16); } @@ -551,9 +842,10 @@ public void FlushToPersistence_UnfinalizedForkAtBoundary_PersistsHeadReachableFo _finalizedStateProvider.SetFinalizedBlockNumber(0); // nothing finalized - using Snapshot fork1 = CreateSnapshot(Block0, target1, compacted: true); - using Snapshot fork2 = CreateSnapshot(Block0, target2, compacted: true); - using Snapshot toHead = CreateSnapshot(target2, head, compacted: true); // head reachable only via target2 + // Repo-owned; FlushToPersistence persists/prunes (disposes) them, so don't double-own. 
+ CreateSnapshot(Block0, target1, compacted: true); + CreateSnapshot(Block0, target2, compacted: true); + CreateSnapshot(target2, head, compacted: true); // head reachable only via target2 _snapshotRepository.SetLastCommittedStateId(head); IPersistence.IWriteBatch writeBatch = Substitute.For(); @@ -579,11 +871,11 @@ public void FlushToPersistence_LongerNonCanonicalFork_PersistsCommittedHeadChain _finalizedStateProvider.SetFinalizedBlockNumber(0); // nothing finalized - using Snapshot fork1 = CreateSnapshot(Block0, target1, compacted: true); - using Snapshot fork2 = CreateSnapshot(Block0, target2, compacted: true); - // Not `using`: the flush prunes this orphaned non-canonical descendant and disposes it itself. - Snapshot toLongHead = CreateSnapshot(target1, longHead, compacted: true); - using Snapshot toCommittedHead = CreateSnapshot(target2, committedHead, compacted: true); + // Repo-owned; FlushToPersistence persists/prunes (disposes) them, so don't double-own. + CreateSnapshot(Block0, target1, compacted: true); + CreateSnapshot(Block0, target2, compacted: true); + CreateSnapshot(target1, longHead, compacted: true); + CreateSnapshot(target2, committedHead, compacted: true); _snapshotRepository.SetLastCommittedStateId(committedHead); IPersistence.IWriteBatch writeBatch = Substitute.For(); @@ -599,49 +891,45 @@ public void FlushToPersistence_LongerNonCanonicalFork_PersistsCommittedHeadChain [Test] public void FlushToPersistence_PrefersFinalizedOverUnfinalized() { - // Arrange - two snapshots at same block, one finalized + // Two snapshots at the same block, one finalized. Set finalized block to the + // candidate block so the BFS seed lands directly on the finalized state. 
StateId finalizedState = CreateStateId(16, rootByte: 1); StateId unfinalizedState = CreateStateId(16, rootByte: 2); - _finalizedStateProvider.SetFinalizedBlockNumber(100); + _finalizedStateProvider.SetFinalizedBlockNumber(16); _finalizedStateProvider.SetFinalizedStateRootAt(16, new Hash256(finalizedState.StateRoot.Bytes)); - // Create both snapshots - using Snapshot finalizedSnapshot = CreateSnapshot(Block0, finalizedState, compacted: true); - using Snapshot unfinalizedSnapshot = CreateSnapshot(Block0, unfinalizedState, compacted: true); + // Repo-owned; FlushToPersistence prunes (disposes) them once persisted, so don't double-own. + CreateSnapshot(Block0, finalizedState, compacted: true); + CreateSnapshot(Block0, unfinalizedState, compacted: true); IPersistence.IWriteBatch writeBatch = Substitute.For(); _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); - // Act StateId result = _persistenceManager.FlushToPersistence(); - // Assert - should persist finalized state Assert.That(result.StateRoot.Bytes.ToArray(), Is.EqualTo(finalizedState.StateRoot.Bytes.ToArray())); } [Test] public void FlushToPersistence_PersistsMultipleSnapshots_InOrder() { - // Arrange StateId state1 = CreateStateId(1); StateId state2 = CreateStateId(2); StateId state3 = CreateStateId(3); - // No finalization - will use first available _finalizedStateProvider.SetFinalizedBlockNumber(0); - using Snapshot snapshot1 = CreateSnapshot(Block0, state1, compacted: false); - using Snapshot snapshot2 = CreateSnapshot(state1, state2, compacted: false); - using Snapshot snapshot3 = CreateSnapshot(state2, state3, compacted: false); + // Repo-owned; FlushToPersistence prunes (disposes) them once persisted, so don't double-own. 
+ CreateSnapshot(Block0, state1, compacted: false); + CreateSnapshot(state1, state2, compacted: false); + CreateSnapshot(state2, state3, compacted: false); IPersistence.IWriteBatch writeBatch = Substitute.For(); _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); - // Act StateId result = _persistenceManager.FlushToPersistence(); - // Assert Assert.That(result, Is.EqualTo(state3)); Received.InOrder(() => { @@ -651,8 +939,51 @@ public void FlushToPersistence_PersistsMultipleSnapshots_InOrder() }); } + [Test] + public void FlushToPersistence_PersistedOnlyTier_WalksAndPrunes() + { + // No in-memory snapshot above the persisted point and nothing finalized: the flush must + // still reach the persisted-tier backlog via the tier-aware latest tip (GetLastSnapshotId + // folds in the persisted maxes) and prune entries the persist supersedes. Regression for + // FlushToPersistence early-returning on a persisted-only tier and never pruning it. + StateId target = CreateStateId(16); + StateId stale = CreateStateId(8); + + PersistBase(Block0, stale); + PersistBase(Block0, target); + + IPersistence.IWriteBatch writeBatch = Substitute.For(); + _persistence.CreateWriteBatch(Arg.Any(), Arg.Any()).Returns(writeBatch); + + StateId result = _persistenceManager.FlushToPersistence(); + + Assert.That(result, Is.EqualTo(target)); + _persistence.Received().CreateWriteBatch(Block0, target); + Assert.That(_snapshotRepository.HasBaseSnapshot(stale), Is.False); + } + #endregion + private PersistenceManager.ConversionCandidate? InvokeTryFindSnapshotToConvert(StateId currentPersistedState) + { + // TryFindSnapshotToConvert is private; reach it via reflection so we can unit-test the + // priority logic without driving the full DetermineSnapshotAction → AddToPersistence loop. 
+ System.Reflection.MethodInfo method = typeof(PersistenceManager).GetMethod( + "TryFindSnapshotToConvert", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!; + return (PersistenceManager.ConversionCandidate?)method.Invoke(_persistenceManager, [currentPersistedState]); + } + + private void InvokeConvertCompactedRange(Snapshot compacted) + { + // ConvertCompactedRange is private; reach it via reflection to unit-test the in-memory + // removal logic directly without driving the full DetermineSnapshotAction → AddToPersistence loop. + System.Reflection.MethodInfo method = typeof(PersistenceManager).GetMethod( + "ConvertCompactedRange", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!; + method.Invoke(_persistenceManager, [compacted]); + } + #region Helper Classes private class TestFinalizedStateProvider : IFinalizedStateProvider diff --git a/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundlePersistedTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundlePersistedTests.cs new file mode 100644 index 000000000000..3cca0a96144c --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundlePersistedTests.cs @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Db; +using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; +using NSubstitute; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class ReadOnlySnapshotBundlePersistedTests +{ + private ResourcePool _pool = null!; + private ArenaManager _memArena = null!; + private string _memArenaDir = null!; + private BlobArenaManager _blobs = null!; + private string _blobsDir = 
null!; + + [SetUp] + public void SetUp() + { + _pool = new ResourcePool(new FlatDbConfig()); + _memArenaDir = Path.Combine(Path.GetTempPath(), $"nm-robtest-arena-{Guid.NewGuid():N}"); + _memArena = TestFixtureHelpers.CreateArenaManager(_memArenaDir); + _blobsDir = Path.Combine(Path.GetTempPath(), $"nm-robtest-blobs-{Guid.NewGuid():N}"); + _blobs = new BlobArenaManager(_blobsDir, 4L * 1024 * 1024); + } + + [TearDown] + public void TearDown() + { + _blobs.Dispose(); + _memArena.Dispose(); + try { Directory.Delete(_blobsDir, recursive: true); } catch { /* best-effort */ } + try { Directory.Delete(_memArenaDir, recursive: true); } catch { /* best-effort */ } + } + + [Test] + public void TryLoadStateRlp_ReturnsFromPersistedSnapshot_BeforePersistence() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + TreePath path = new(Keccak.Compute("path"), 4); + byte[] nodeRlp = [0xC2, 0x80, 0x80]; + + SnapshotContent content = new(); + content.StateNodes[path] = new TrieNode(NodeType.Leaf, nodeRlp); + Snapshot snap = new(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing); + byte[] tableData = PersistedSnapshotBuilderTestExtensions.Build(snap, _blobs); + + PersistedSnapshot persisted = CreatePersistedSnapshot(s0, s1, tableData); + PersistedSnapshotList list = new(1) { persisted }; + + IPersistence.IPersistenceReader reader = Substitute.For(); + + using ReadOnlySnapshotBundle bundle = new( + new SnapshotPooledList(0), + reader, + recordDetailedMetrics: false, + persistedSnapshots: AlwaysTrueStack(list)); + + byte[]? 
result = bundle.TryLoadStateRlp(path, Keccak.Compute("hash"), ReadFlags.None); + + Assert.That(result, Is.EqualTo(nodeRlp)); + reader.DidNotReceive().TryLoadStateRlp(Arg.Any(), Arg.Any()); + } + + [Test] + public void TryLoadStorageRlp_ReturnsFromPersistedSnapshot_BeforePersistence() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + Hash256 address = Keccak.Compute("address"); + TreePath path = new(Keccak.Compute("path"), 6); + byte[] nodeRlp = [0xC1, 0x80]; + + SnapshotContent content = new(); + content.StorageNodes[(address, path)] = new TrieNode(NodeType.Branch, nodeRlp); + Snapshot snap = new(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing); + byte[] tableData = PersistedSnapshotBuilderTestExtensions.Build(snap, _blobs); + + PersistedSnapshot persisted = CreatePersistedSnapshot(s0, s1, tableData); + PersistedSnapshotList list = new(1) { persisted }; + + IPersistence.IPersistenceReader reader = Substitute.For(); + + using ReadOnlySnapshotBundle bundle = new( + new SnapshotPooledList(0), + reader, + recordDetailedMetrics: false, + persistedSnapshots: AlwaysTrueStack(list)); + + byte[]? 
result = bundle.TryLoadStorageRlp(address, path, Keccak.Compute("hash"), ReadFlags.None); + + Assert.That(result, Is.EqualTo(nodeRlp)); + reader.DidNotReceive().TryLoadStorageRlp(Arg.Any(), Arg.Any(), Arg.Any()); + } + + [Test] + public void TryLoadStateRlp_FallsThrough_WhenNotInPersistedSnapshot() + { + StateId s0 = new(0, Keccak.EmptyTreeHash); + StateId s1 = new(1, Keccak.Compute("1")); + + TreePath storedPath = new(Keccak.Compute("stored"), 4); + TreePath missingPath = new(Keccak.Compute("missing"), 3); + byte[] nodeRlp = [0xC0]; + byte[] dbRlp = [0xC1, 0x80, 0x80]; + + SnapshotContent content = new(); + content.StateNodes[storedPath] = new TrieNode(NodeType.Leaf, nodeRlp); + Snapshot snap = new(s0, s1, content, _pool, ResourcePool.Usage.MainBlockProcessing); + byte[] tableData = PersistedSnapshotBuilderTestExtensions.Build(snap, _blobs); + + PersistedSnapshot persisted = CreatePersistedSnapshot(s0, s1, tableData); + PersistedSnapshotList list = new(1) { persisted }; + + IPersistence.IPersistenceReader reader = Substitute.For(); + reader.TryLoadStateRlp(Arg.Any(), Arg.Any()).Returns(dbRlp); + + using ReadOnlySnapshotBundle bundle = new( + new SnapshotPooledList(0), + reader, + recordDetailedMetrics: false, + persistedSnapshots: AlwaysTrueStack(list)); + + byte[]? result = bundle.TryLoadStateRlp(missingPath, Keccak.Compute("hash"), ReadFlags.None); + + Assert.That(result, Is.EqualTo(dbRlp)); + reader.Received(1).TryLoadStateRlp(Arg.Any(), Arg.Any()); + } + + [Test] + public void TryLoadStateRlp_WithoutPersistedSnapshots_GoesDirectlyToPersistence() + { + byte[] dbRlp = [0xC0]; + TreePath path = new(Keccak.Compute("path"), 4); + + IPersistence.IPersistenceReader reader = Substitute.For(); + reader.TryLoadStateRlp(Arg.Any(), Arg.Any()).Returns(dbRlp); + + using ReadOnlySnapshotBundle bundle = new( + new SnapshotPooledList(0), + reader, + recordDetailedMetrics: false, + persistedSnapshots: PersistedSnapshotStack.Empty()); + + byte[]? 
result = bundle.TryLoadStateRlp(path, Keccak.Compute("hash"), ReadFlags.None); + + Assert.That(result, Is.EqualTo(dbRlp)); + reader.Received(1).TryLoadStateRlp(Arg.Any(), Arg.Any()); + } + + // Each test snapshot is constructed without a bloom, so it carries the AlwaysTrue + // placeholder — the stack probes every snapshot unfiltered, which is what these tests want. + private static PersistedSnapshotStack AlwaysTrueStack(PersistedSnapshotList list) => + new(list, recordDetailedMetrics: false); + + private PersistedSnapshot CreatePersistedSnapshot(StateId from, StateId to, byte[] data) => + TestFixtureHelpers.CreatePersistedSnapshot(_memArena, _blobs, from, to, data); +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundleTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundleTests.cs index bb9f720245ce..d0118b84999d 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundleTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/ReadOnlySnapshotBundleTests.cs @@ -9,6 +9,7 @@ using Nethermind.Db; using Nethermind.Int256; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.Trie; using NSubstitute; using NUnit.Framework; @@ -27,7 +28,8 @@ private Snapshot MakeSnapshot(Action? populate = null) => FlatTestHelpers.MakeSnapshot(_pool, populate); private static ReadOnlySnapshotBundle Bundle(SnapshotPooledList snapshots, IPersistence.IPersistenceReader? reader = null, bool recordDetailedMetrics = false) => - new(snapshots, reader ?? Substitute.For(), recordDetailedMetrics); + new(snapshots, reader ?? 
Substitute.For(), recordDetailedMetrics, + PersistedSnapshotStack.Empty(recordDetailedMetrics)); [TestCase(true)] [TestCase(false)] diff --git a/src/Nethermind/Nethermind.State.Flat.Test/SnapshotCompactorTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/SnapshotCompactorTests.cs index 3b89ece9484b..5f62ef92f184 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/SnapshotCompactorTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/SnapshotCompactorTests.cs @@ -20,6 +20,7 @@ public class SnapshotCompactorTests private SnapshotCompactor _compactor = null!; private ResourcePool _resourcePool = null!; private FlatDbConfig _config = null!; + private FlatTestContainer _tier = null!; private SnapshotRepository _snapshotRepository; [SetUp] @@ -27,10 +28,14 @@ public void SetUp() { _config = new FlatDbConfig { CompactSize = 16 }; _resourcePool = new ResourcePool(_config); - _snapshotRepository = new SnapshotRepository(LimboLogs.Instance); + _tier = new FlatTestContainer(); + _snapshotRepository = _tier.Repository; _compactor = new SnapshotCompactor(_config, ScheduleHelper.CreateWithOffset(_config, 0), _resourcePool, _snapshotRepository, LimboLogs.Instance); } + [TearDown] + public void TearDown() => _tier.Dispose(); + private static StateId CreateStateId(long blockNumber, byte rootByte = 0) { byte[] bytes = new byte[32]; @@ -46,7 +51,7 @@ private void BuildSnapshotChain(long startBlock, long endBlock) StateId to = CreateStateId(i + 1); Snapshot snapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); - bool added = _snapshotRepository.TryAddSnapshot(snapshot); + bool added = _snapshotRepository.TryAdd(snapshot, SnapshotTier.InMemoryBase); Assert.That(added, Is.True, $"Failed to add snapshot {i}->{i + 1}"); _snapshotRepository.AddStateId(to); } @@ -274,9 +279,9 @@ public void CompactSnapshotBundle_SelfDestructedAddress_RemovesStorageAndNodes() using Snapshot compacted = _compactor.CompactSnapshotBundle(snapshots); // 
Self-destructed address should be tracked, and its storage cleared + // Storage nodes are not cleared — orphaned nodes are skipped during trie traversal Assert.That(compacted.Content.SelfDestructedStorageAddresses.Count, Is.GreaterThan(0)); Assert.That(compacted.StoragesCount, Is.EqualTo(0)); - Assert.That(compacted.StorageNodesCount, Is.EqualTo(0)); } [Test] @@ -344,12 +349,12 @@ public void CompactSnapshotBundle_UsesMidCompactorUsageNonBoundary() } [Test] - public void Debug_AssembleSnapshotsUntil_Works() + public void Debug_AssembleInMemorySnapshotsForCompaction_Works() { BuildSnapshotChain(0, 4); StateId target = CreateStateId(4); - SnapshotPooledList assembled = _snapshotRepository.AssembleSnapshotsUntil(target, 0, 10); + SnapshotPooledList assembled = _snapshotRepository.AssembleInMemorySnapshotsForCompaction(target, 0, 10); Assert.That(assembled.Count, Is.EqualTo(4)); @@ -399,14 +404,12 @@ public void GetSnapshotsToCompact_NotCompactionBlock_ReturnsEmpty() [Test] public void GetSnapshotsToCompact_FullCompaction_ReturnsMultipleSnapshots() { - // Build chain of 15 snapshots (0->1, 1->2, ..., 14->15) BuildSnapshotChain(0, 15); - // Add the 16th snapshot (15->16) separately StateId targetFrom = CreateStateId(15); StateId targetTo = CreateStateId(16); Snapshot targetSnapshot = _resourcePool.CreateSnapshot(targetFrom, targetTo, ResourcePool.Usage.ReadOnlyProcessingEnv); - _snapshotRepository.TryAddSnapshot(targetSnapshot); + _snapshotRepository.TryAdd(targetSnapshot, SnapshotTier.InMemoryBase); _snapshotRepository.AddStateId(targetTo); using SnapshotPooledList snapshots = _compactor.GetSnapshotsToCompact(targetSnapshot); @@ -422,7 +425,7 @@ public void GetSnapshotsToCompact_PowerOf2Compaction_ReturnsCorrectCount(long bl BuildSnapshotChain(0, blockNumber); StateId targetTo = CreateStateId(blockNumber); - _snapshotRepository.TryLeaseState(targetTo, out Snapshot? 
targetSnapshot); + _snapshotRepository.TryLeaseInMemoryState(targetTo, SnapshotTier.InMemoryBase, out Snapshot? targetSnapshot); using SnapshotPooledList snapshots = _compactor.GetSnapshotsToCompact(targetSnapshot!); @@ -436,7 +439,7 @@ public void GetSnapshotsToCompact_SingleSnapshot_ReturnsEmpty() StateId from = new(0, Keccak.Zero); StateId to = new(16, Keccak.Zero); Snapshot snapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); - _snapshotRepository.TryAddSnapshot(snapshot); + _snapshotRepository.TryAdd(snapshot, SnapshotTier.InMemoryBase); _snapshotRepository.AddStateId(to); using Snapshot targetSnapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); @@ -455,7 +458,7 @@ public void GetSnapshotsToCompact_IncompleteChain_ReturnsEmpty() StateId from = new(i, Keccak.Zero); StateId to = new(i + 1, Keccak.Zero); Snapshot snapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); - _snapshotRepository.TryAddSnapshot(snapshot); + _snapshotRepository.TryAdd(snapshot, SnapshotTier.InMemoryBase); _snapshotRepository.AddStateId(to); } @@ -471,15 +474,13 @@ public void GetSnapshotsToCompact_IncompleteChain_ReturnsEmpty() [Test] public void DoCompactSnapshot_ValidChain_CreatesCompactedSnapshot() { - // Build chain of 15 snapshots (0->1, 1->2, ..., 14->15) BuildSnapshotChain(0, 15); - // Add the 16th snapshot (15->16) separately StateId targetFrom = CreateStateId(15); StateId targetTo = CreateStateId(16); Snapshot targetSnapshot = _resourcePool.CreateSnapshot(targetFrom, targetTo, ResourcePool.Usage.ReadOnlyProcessingEnv); targetSnapshot.Content.Accounts[TestItem.AddressB] = new Account((UInt256)20, (UInt256)2000); - _snapshotRepository.TryAddSnapshot(targetSnapshot); + _snapshotRepository.TryAdd(targetSnapshot, SnapshotTier.InMemoryBase); _snapshotRepository.AddStateId(targetTo); _compactor.DoCompactSnapshot(targetSnapshot.To); @@ -496,7 +497,8 @@ public void 
Constructor_NonPowerOf2CompactSize_Throws() => public void GetSnapshotsToCompact_Size2Compaction_AllowedByDefault() { FlatDbConfig config = new() { CompactSize = 16 }; - SnapshotRepository repo = new(LimboLogs.Instance); + using FlatTestContainer tier = new(); + SnapshotRepository repo = tier.Repository; SnapshotCompactor compactor = new(config, ScheduleHelper.CreateWithOffset(config, 0), _resourcePool, repo, LimboLogs.Instance); for (long i = 0; i < 2; i++) @@ -504,12 +506,12 @@ public void GetSnapshotsToCompact_Size2Compaction_AllowedByDefault() StateId from = CreateStateId(i); StateId to = CreateStateId(i + 1); Snapshot snapshot = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); - repo.TryAddSnapshot(snapshot); + repo.TryAdd(snapshot, SnapshotTier.InMemoryBase); repo.AddStateId(to); } StateId target = CreateStateId(2); - repo.TryLeaseState(target, out Snapshot? targetSnapshot); + repo.TryLeaseInMemoryState(target, SnapshotTier.InMemoryBase, out Snapshot? targetSnapshot); using SnapshotPooledList snapshots = compactor.GetSnapshotsToCompact(targetSnapshot!); @@ -555,7 +557,8 @@ public void GetSnapshotsToCompact_WithOffset_FullCompactionShiftedFromBoundary() // CompactSize=16, offset=3 -> full compaction triggers when (block+3) % 16 == 0, // i.e. at blocks 13, 29, 45, ... Build a chain to block 29 (second full boundary). 
FlatDbConfig config = new() { CompactSize = 16 }; - SnapshotRepository repo = new(LimboLogs.Instance); + using FlatTestContainer tier = new(); + SnapshotRepository repo = tier.Repository; SnapshotCompactor compactor = new(config, ScheduleHelper.CreateWithOffset(config, 3), _resourcePool, repo, LimboLogs.Instance); for (long i = 0; i < 29; i++) @@ -563,20 +566,20 @@ public void GetSnapshotsToCompact_WithOffset_FullCompactionShiftedFromBoundary() StateId from = CreateStateId(i); StateId to = CreateStateId(i + 1); Snapshot s = _resourcePool.CreateSnapshot(from, to, ResourcePool.Usage.ReadOnlyProcessingEnv); - repo.TryAddSnapshot(s); + repo.TryAdd(s, SnapshotTier.InMemoryBase); repo.AddStateId(to); } // Block 29: (29+3) & -(29+3) = 32 & -32 = 32, capped at CompactSize=16 -> full compaction StateId target29 = CreateStateId(29); - repo.TryLeaseState(target29, out Snapshot? targetSnapshot); + repo.TryLeaseInMemoryState(target29, SnapshotTier.InMemoryBase, out Snapshot? targetSnapshot); using SnapshotPooledList snapshots29 = compactor.GetSnapshotsToCompact(targetSnapshot!); Assert.That(snapshots29.Count, Is.EqualTo(16), "Block 29 should trigger full compaction with offset=3"); targetSnapshot!.Dispose(); // Block 16: (16+3) & -(16+3) = 19 & -19 = 1 -> caller sees compactSize<=1, no compaction StateId target16 = CreateStateId(16); - repo.TryLeaseState(target16, out targetSnapshot); + repo.TryLeaseInMemoryState(target16, SnapshotTier.InMemoryBase, out targetSnapshot); using SnapshotPooledList snapshots16 = compactor.GetSnapshotsToCompact(targetSnapshot!); Assert.That(snapshots16.Count, Is.EqualTo(0), "Block 16 should NOT trigger compaction with offset=3"); targetSnapshot!.Dispose(); @@ -587,7 +590,8 @@ public void CompactSnapshotBundle_WithOffset_UsesCorrectUsageTier() { // CompactSize=16, offset=3. At block 13 the bit trick yields 16 -> Compact16 tier. 
FlatDbConfig config = new() { CompactSize = 16 }; - SnapshotRepository repo = new(LimboLogs.Instance); + using FlatTestContainer tier = new(); + SnapshotRepository repo = tier.Repository; SnapshotCompactor compactor = new(config, ScheduleHelper.CreateWithOffset(config, 3), _resourcePool, repo, LimboLogs.Instance); StateId from = new(0, Keccak.Zero); diff --git a/src/Nethermind/Nethermind.State.Flat.Test/SnapshotRepositoryTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/SnapshotRepositoryTests.cs index 0a93d7319118..ebc32cd58f2f 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/SnapshotRepositoryTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/SnapshotRepositoryTests.cs @@ -7,7 +7,6 @@ using Nethermind.Core.Crypto; using Nethermind.Core.Test.Builders; using Nethermind.Db; -using Nethermind.Logging; using NUnit.Framework; namespace Nethermind.State.Flat.Test; @@ -15,6 +14,7 @@ namespace Nethermind.State.Flat.Test; [TestFixture] public class SnapshotRepositoryTests { + private FlatTestContainer _tier = null!; private SnapshotRepository _repository = null!; private ResourcePool _resourcePool = null!; private FlatDbConfig _config = null!; @@ -24,9 +24,13 @@ public void SetUp() { _config = new FlatDbConfig { CompactSize = 16 }; _resourcePool = new ResourcePool(_config); - _repository = new SnapshotRepository(LimboLogs.Instance); + _tier = new FlatTestContainer(); + _repository = _tier.Repository; } + [TearDown] + public void TearDown() => _tier.Dispose(); + private StateId CreateStateId(long blockNumber, byte rootByte = 0) { byte[] bytes = new byte[32]; @@ -51,9 +55,7 @@ private Snapshot AddSnapshotToRepository(StateId from, StateId to, bool compacte { Snapshot snapshot = CreateSnapshot(from, to, withData); - bool added = compacted - ? _repository.TryAddCompactedSnapshot(snapshot) - : _repository.TryAddSnapshot(snapshot); + bool added = _repository.TryAdd(snapshot, compacted ? 
SnapshotTier.InMemoryCompacted : SnapshotTier.InMemoryBase); Assert.That(added, Is.True, $"Failed to add snapshot {from}->{to}"); @@ -66,9 +68,7 @@ private Snapshot AddSnapshotToRepository(StateId from, StateId to, bool compacte } private bool TryLease(StateId state, bool compacted, out Snapshot? snapshot) - => compacted - ? _repository.TryLeaseCompactedState(state, out snapshot) - : _repository.TryLeaseState(state, out snapshot); + => _repository.TryLeaseInMemoryState(state, compacted ? SnapshotTier.InMemoryCompacted : SnapshotTier.InMemoryBase, out snapshot); private List BuildSnapshotChain(long startBlock, long endBlock) { @@ -101,8 +101,9 @@ public void TryAddSnapshot_NewAndDuplicate_BehavesCorrectly([Values] bool compac Snapshot snapshot1 = CreateSnapshot(from, to); Snapshot snapshot2 = CreateSnapshot(from, to); - bool added1 = compacted ? _repository.TryAddCompactedSnapshot(snapshot1) : _repository.TryAddSnapshot(snapshot1); - bool added2 = compacted ? _repository.TryAddCompactedSnapshot(snapshot2) : _repository.TryAddSnapshot(snapshot2); + SnapshotTier tier = compacted ? SnapshotTier.InMemoryCompacted : SnapshotTier.InMemoryBase; + bool added1 = _repository.TryAdd(snapshot1, tier); + bool added2 = _repository.TryAdd(snapshot2, tier); Assert.That(added1, Is.True); Assert.That(added2, Is.False); @@ -118,12 +119,12 @@ public void AddAndRemoveSnapshot_CannotLeaseAfterRemoval() Snapshot snapshot = CreateSnapshot(from, to); _repository.AddStateId(to); - _repository.TryAddSnapshot(snapshot); - bool leasedBefore = _repository.TryLeaseState(to, out Snapshot? leasedSnapshot); + _repository.TryAdd(snapshot, SnapshotTier.InMemoryBase); + bool leasedBefore = _repository.TryLeaseInMemoryState(to, SnapshotTier.InMemoryBase, out Snapshot? 
leasedSnapshot); leasedSnapshot?.Dispose(); - _repository.RemoveAndReleaseKnownState(to); - bool leasedAfter = _repository.TryLeaseState(to, out _); + _repository.RemoveAndReleaseInMemoryKnownState(to, SnapshotTier.InMemoryBase); + bool leasedAfter = _repository.TryLeaseInMemoryState(to, SnapshotTier.InMemoryBase, out _); Assert.That(leasedBefore, Is.True); Assert.That(leasedAfter, Is.False); @@ -135,18 +136,18 @@ public void RemoveSnapshot_WithActiveLeases_DisposesWhenAllReleased() AddSnapshotToRepository(0, 1); StateId to = CreateStateId(1); - bool leased1 = _repository.TryLeaseState(to, out Snapshot? snapshot1); - bool leased2 = _repository.TryLeaseState(to, out Snapshot? snapshot2); + bool leased1 = _repository.TryLeaseInMemoryState(to, SnapshotTier.InMemoryBase, out Snapshot? snapshot1); + bool leased2 = _repository.TryLeaseInMemoryState(to, SnapshotTier.InMemoryBase, out Snapshot? snapshot2); Assert.That(leased1, Is.True); Assert.That(leased2, Is.True); - _repository.RemoveAndReleaseKnownState(to); + _repository.RemoveAndReleaseInMemoryKnownState(to, SnapshotTier.InMemoryBase); snapshot1!.Dispose(); snapshot2!.Dispose(); - bool leasedAfter = _repository.TryLeaseState(to, out _); + bool leasedAfter = _repository.TryLeaseInMemoryState(to, SnapshotTier.InMemoryBase, out _); Assert.That(leasedAfter, Is.False); } @@ -210,31 +211,31 @@ public void HasState_ExistingAndNonExistent() } [Test] - public void GetSnapshotBeforeStateId_EmptyRepository() + public void GetStatesUpToBlock_EmptyRepository() { StateId target = CreateStateId(10); - ArrayPoolList states = _repository.GetSnapshotBeforeStateId(target); + ArrayPoolList states = _repository.GetStatesUpToBlock(target.BlockNumber); Assert.That(states.Count, Is.EqualTo(0)); states.Dispose(); } [Test] - public void GetSnapshotBeforeStateId_NoStatesBeforeTarget() + public void GetStatesUpToBlock_NoStatesBeforeTarget() { StateId state10 = CreateStateId(10); _repository.AddStateId(state10); StateId target = 
CreateStateId(5); - ArrayPoolList states = _repository.GetSnapshotBeforeStateId(target); + ArrayPoolList states = _repository.GetStatesUpToBlock(target.BlockNumber); Assert.That(states.Count, Is.EqualTo(0)); states.Dispose(); } [Test] - public void GetSnapshotBeforeStateId_StatesBeforeTarget() + public void GetStatesUpToBlock_StatesBeforeTarget() { StateId state1 = CreateStateId(1); StateId state3 = CreateStateId(3); @@ -249,7 +250,7 @@ public void GetSnapshotBeforeStateId_StatesBeforeTarget() _repository.AddStateId(state10); StateId target = CreateStateId(6); - ArrayPoolList states = _repository.GetSnapshotBeforeStateId(target); + ArrayPoolList states = _repository.GetStatesUpToBlock(target.BlockNumber); Assert.That(states.Count, Is.EqualTo(3)); states.Dispose(); @@ -257,12 +258,11 @@ public void GetSnapshotBeforeStateId_StatesBeforeTarget() [TestCase(-1)] [TestCase(long.MinValue)] - public void GetSnapshotBeforeStateId_NegativeBlockNumber_ReturnsEmpty(long blockNumber) + public void GetStatesUpToBlock_NegativeBlockNumber_ReturnsEmpty(long blockNumber) { _repository.AddStateId(CreateStateId(1)); - StateId target = new(blockNumber, Keccak.EmptyTreeHash); - ArrayPoolList states = _repository.GetSnapshotBeforeStateId(target); + ArrayPoolList states = _repository.GetStatesUpToBlock(blockNumber); Assert.That(states.Count, Is.EqualTo(0)); states.Dispose(); @@ -270,78 +270,126 @@ public void GetSnapshotBeforeStateId_NegativeBlockNumber_ReturnsEmpty(long block #endregion - #region AssembleSnapshotsUntil + #region AssembleInMemorySnapshotsForCompaction [Test] - public void AssembleSnapshotsUntil_EmptyRepository() + public void AssembleInMemorySnapshotsForCompaction_EmptyRepository() { StateId target = CreateStateId(10); - using SnapshotPooledList assembled = _repository.AssembleSnapshotsUntil(target, 0, 10); + using SnapshotPooledList assembled = _repository.AssembleInMemorySnapshotsForCompaction(target, 0, 10); Assert.That(assembled.Count, Is.EqualTo(0)); } [Test] - 
public void AssembleSnapshotsUntil_SingleSnapshot() + public void AssembleInMemorySnapshotsForCompaction_SingleSnapshot() { AddSnapshotToRepository(0, 1); StateId target = CreateStateId(1); - using SnapshotPooledList assembled = _repository.AssembleSnapshotsUntil(target, 0, 10); + using SnapshotPooledList assembled = _repository.AssembleInMemorySnapshotsForCompaction(target, 0, 10); Assert.That(assembled.Count, Is.EqualTo(1)); Assert.That(assembled[0].To, Is.EqualTo(target)); } [Test] - public void AssembleSnapshotsUntil_LinearChain() + public void AssembleInMemorySnapshotsForCompaction_LinearChain() { BuildSnapshotChain(0, 4); StateId target = CreateStateId(4); - using SnapshotPooledList assembled = _repository.AssembleSnapshotsUntil(target, 0, 10); + using SnapshotPooledList assembled = _repository.AssembleInMemorySnapshotsForCompaction(target, 0, 10); Assert.That(assembled.Count, Is.EqualTo(4)); } [Test] - public void AssembleSnapshotsUntil_StopsAtStartingBlock() + public void AssembleInMemorySnapshotsForCompaction_StopsAtStartingBlock() { BuildSnapshotChain(0, 5); StateId target = CreateStateId(4); - using SnapshotPooledList assembled = _repository.AssembleSnapshotsUntil(target, 2, 10); + using SnapshotPooledList assembled = _repository.AssembleInMemorySnapshotsForCompaction(target, 2, 10); Assert.That(assembled.Count, Is.EqualTo(2)); } [Test] - public void AssembleSnapshotsUntil_PrefersCompacted() + public void AssembleInMemorySnapshotsForCompaction_PrefersCompacted() { StateId from = CreateStateId(0); StateId to = CreateStateId(1); Snapshot compacted = CreateSnapshot(from, to); - _repository.TryAddCompactedSnapshot(compacted); + _repository.TryAdd(compacted, SnapshotTier.InMemoryCompacted); - using SnapshotPooledList assembled = _repository.AssembleSnapshotsUntil(to, 0, 10); + using SnapshotPooledList assembled = _repository.AssembleInMemorySnapshotsForCompaction(to, 0, 10); Assert.That(assembled.Count, Is.EqualTo(1)); } #endregion + #region AssembleSnapshots 
+ + [Test] + public void AssembleSnapshots_PersistedSpanning_BelowTarget_AcceptedAsTerminal() + { + StateId s0 = CreateStateId(0); + StateId s2 = CreateStateId(2); + StateId s5 = CreateStateId(5); + + // A persisted base spanning (s0, s5] — its From is below the target s2. + _tier.ConvertToPersistedBase(CreateSnapshot(s0, s5)).Dispose(); + + using AssembledSnapshotResult result = _repository.AssembleSnapshots(s5, s2, 4); + + Assert.That(result.Persisted.Count, Is.EqualTo(1)); + Assert.That(result.InMemory.Count, Is.EqualTo(0)); + Assert.That(result.Persisted[0].From.BlockNumber, Is.LessThan(s2.BlockNumber)); + } + + [Test] + public void AssembleSnapshots_InMemoryOvershoot_Rejected() + { + StateId s2 = CreateStateId(2); + StateId s5 = CreateStateId(5); + + AddSnapshotToRepository(0, 5, compacted: true); + + using AssembledSnapshotResult result = _repository.AssembleSnapshots(s5, s2, 4); + + Assert.That(result.SnapshotCount, Is.EqualTo(0)); + } + + [Test] + public void AssembleSnapshots_ExactPersistedMatch_AcceptedAsWinner() + { + StateId s2 = CreateStateId(2); + StateId s5 = CreateStateId(5); + + // A persisted base whose From is exactly the target s2. 
+ _tier.ConvertToPersistedBase(CreateSnapshot(s2, s5)).Dispose(); + + using AssembledSnapshotResult result = _repository.AssembleSnapshots(s5, s2, 4); + + Assert.That(result.Persisted.Count, Is.EqualTo(1)); + Assert.That(result.InMemory.Count, Is.EqualTo(0)); + Assert.That(result.Persisted[0].From.BlockNumber, Is.EqualTo(s2.BlockNumber)); + } + [Test] public void AssembleSnapshots_LinearChain_ReturnsAscendingPathToTarget() { BuildSnapshotChain(0, 5); - using SnapshotPooledList assembled = _repository.AssembleSnapshots(CreateStateId(5), CreateStateId(0), 10); + using AssembledSnapshotResult assembled = _repository.AssembleSnapshots(CreateStateId(5), CreateStateId(0), 10); - Assert.That(assembled.Count, Is.EqualTo(5)); - Assert.That(assembled[0].From, Is.EqualTo(CreateStateId(0))); - Assert.That(assembled[^1].To, Is.EqualTo(CreateStateId(5))); + Assert.That(assembled.InMemory.Count, Is.EqualTo(5)); + Assert.That(assembled.InMemory[0].From, Is.EqualTo(CreateStateId(0))); + Assert.That(assembled.InMemory[^1].To, Is.EqualTo(CreateStateId(5))); } [Test] @@ -349,11 +397,11 @@ public void AssembleSnapshots_CompactedSnapshot_TakesWideHop() { AddSnapshotToRepository(0, 5, compacted: true); - using SnapshotPooledList assembled = _repository.AssembleSnapshots(CreateStateId(5), CreateStateId(0), 10); + using AssembledSnapshotResult assembled = _repository.AssembleSnapshots(CreateStateId(5), CreateStateId(0), 10); - Assert.That(assembled.Count, Is.EqualTo(1)); - Assert.That(assembled[0].From, Is.EqualTo(CreateStateId(0))); - Assert.That(assembled[0].To, Is.EqualTo(CreateStateId(5))); + Assert.That(assembled.InMemory.Count, Is.EqualTo(1)); + Assert.That(assembled.InMemory[0].From, Is.EqualTo(CreateStateId(0))); + Assert.That(assembled.InMemory[0].To, Is.EqualTo(CreateStateId(5))); } [Test] @@ -362,11 +410,11 @@ public void AssembleSnapshots_CompactedOvershoot_FallsBackToBaseEdges() BuildSnapshotChain(0, 5); AddSnapshotToRepository(0, 5, compacted: true); - using 
SnapshotPooledList assembled = _repository.AssembleSnapshots(CreateStateId(5), CreateStateId(2), 10); + using AssembledSnapshotResult assembled = _repository.AssembleSnapshots(CreateStateId(5), CreateStateId(2), 10); - Assert.That(assembled.Count, Is.EqualTo(3)); - Assert.That(assembled[0].From, Is.EqualTo(CreateStateId(2))); - Assert.That(assembled[^1].To, Is.EqualTo(CreateStateId(5))); + Assert.That(assembled.InMemory.Count, Is.EqualTo(3)); + Assert.That(assembled.InMemory[0].From, Is.EqualTo(CreateStateId(2))); + Assert.That(assembled.InMemory[^1].To, Is.EqualTo(CreateStateId(5))); } [Test] @@ -374,9 +422,9 @@ public void AssembleSnapshots_BaseEqualsTarget_ReturnsEmpty() { BuildSnapshotChain(0, 3); - using SnapshotPooledList assembled = _repository.AssembleSnapshots(CreateStateId(3), CreateStateId(3), 10); + using AssembledSnapshotResult assembled = _repository.AssembleSnapshots(CreateStateId(3), CreateStateId(3), 10); - Assert.That(assembled.Count, Is.EqualTo(0)); + Assert.That(assembled.InMemory.Count, Is.EqualTo(0)); } [Test] @@ -384,9 +432,9 @@ public void AssembleSnapshots_UnreachableTarget_ReturnsEmpty() { BuildSnapshotChain(1, 4); - using SnapshotPooledList assembled = _repository.AssembleSnapshots(CreateStateId(4), CreateStateId(0), 10); + using AssembledSnapshotResult assembled = _repository.AssembleSnapshots(CreateStateId(4), CreateStateId(0), 10); - Assert.That(assembled.Count, Is.EqualTo(0)); + Assert.That(assembled.InMemory.Count, Is.EqualTo(0)); } [Test] @@ -394,11 +442,15 @@ public void AssembleSnapshots_SelfReferencingSnapshot_ReturnsEmptyWithoutHanging { AddSnapshotToRepository(CreateStateId(1), CreateStateId(1)); - using SnapshotPooledList assembled = _repository.AssembleSnapshots(CreateStateId(1), CreateStateId(0), 10); + using AssembledSnapshotResult assembled = _repository.AssembleSnapshots(CreateStateId(1), CreateStateId(0), 10); - Assert.That(assembled.Count, Is.EqualTo(0)); + Assert.That(assembled.InMemory.Count, Is.EqualTo(0)); } + 
#endregion + + #region RemoveSiblingAndDescendents + [Test] public void RemoveSiblingAndDescendents_LinearChain_RemovesNothing() { @@ -443,4 +495,6 @@ public void RemoveSiblingAndDescendents_ForkAbovePersistedBlock_KeepsBothBranche Assert.That(_repository.HasState(CreateStateId(7)), Is.True); Assert.That(_repository.HasState(CreateStateId(7, rootByte: 1)), Is.True); } + + #endregion } diff --git a/src/Nethermind/Nethermind.State.Flat.Test/Sorted/BlockTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/Sorted/BlockTests.cs new file mode 100644 index 000000000000..7750d106f886 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/Sorted/BlockTests.cs @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.Buffers.Binary; +using System.Collections.Generic; +using Nethermind.Core.Extensions; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots.Sorted; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test.Sorted; + +[TestFixture] +public class BlockTests +{ + private static byte[] BuildBlock(int restartInterval, (byte[] Key, byte[] Value)[] entries) + { + using PooledByteBufferWriter pooled = new(256); + using BlockBuilder block = new(restartInterval); + foreach ((byte[] key, byte[] value) in entries) + block.Add(key, value); + block.Finish(ref pooled.GetWriter()); + return pooled.WrittenSpan.ToArray(); + } + + private static bool SeekCeiling(byte[] block, ReadOnlySpan target, out byte[] key, out byte[] value) + { + SpanByteReader reader = new(block); + Span keyBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, 0, target, keyBuf, out int keyLen, out Bound v)) + { + key = []; + value = []; + return false; + } + key = keyBuf[..keyLen].ToArray(); + value = new byte[v.Length]; + reader.TryRead(v.Offset, value); + return true; + } + + private static bool SeekCeilingClamped(byte[] block, ReadOnlySpan target, long 
firstRestart, long lastRestart, out byte[] key, out byte[] value) + { + SpanByteReader reader = new(block); + Span keyBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, 0, target, keyBuf, out int keyLen, out Bound v, firstRestart, lastRestart)) + { + key = []; + value = []; + return false; + } + key = keyBuf[..keyLen].ToArray(); + value = new byte[v.Length]; + reader.TryRead(v.Offset, value); + return true; + } + + [Test] + public void Picks_width_2_for_a_small_block() + { + (byte[], byte[])[] entries = + [ + (Bytes.FromHexString("10"), Bytes.FromHexString("aa")), + (Bytes.FromHexString("20"), Bytes.FromHexString("bb")), + (Bytes.FromHexString("30"), Bytes.FromHexString("cc")), + ]; + byte[] block = BuildBlock(8, entries); + Assert.That(block[0], Is.EqualTo(Block.Width2)); + + foreach ((byte[] key, byte[] value) in entries) + { + Assert.That(SeekCeiling(block, key, out byte[] gotKey, out byte[] gotVal), Is.True); + Assert.That(gotKey, Is.EqualTo(key)); + Assert.That(gotVal, Is.EqualTo(value)); + } + } + + // Enough records that recordsEnd exceeds 65535, forcing the 4-byte offset width — the path the + // multi-MB index block takes for a full-state snapshot, exercised cheaply at the block layer. 
+ [Test] + public void Picks_width_4_when_block_exceeds_64KiB() + { + const int count = 8000; + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(key, i); + entries[i] = (key, [(byte)i, (byte)(i >> 8), 0xAB, 0xCD, 0xEF, 0x01, 0x02, 0x03]); + } + byte[] block = BuildBlock(8, entries); + Assert.That(block[0], Is.EqualTo(Block.Width4), "recordsEnd > 65535 must select the 4-byte width"); + + foreach (int i in (int[])[0, 1, 100, 4000, 7999]) + { + Assert.That(SeekCeiling(block, entries[i].Key, out byte[] gotKey, out byte[] gotVal), Is.True); + Assert.That(gotKey, Is.EqualTo(entries[i].Key)); + Assert.That(gotVal, Is.EqualTo(entries[i].Value)); + } + + byte[] pastEnd = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(pastEnd, count); + Assert.That(SeekCeiling(block, pastEnd, out _, out _), Is.False); + } + + [Test] + public void Ceiling_before_first_key_returns_first() + { + byte[] block = BuildBlock(8, + [ + (Bytes.FromHexString("10"), Bytes.FromHexString("a0")), + (Bytes.FromHexString("20"), Bytes.FromHexString("a1")), + (Bytes.FromHexString("30"), Bytes.FromHexString("a2")), + ]); + Assert.That(SeekCeiling(block, Bytes.FromHexString("05"), out byte[] key, out byte[] value), Is.True); + Assert.That(key, Is.EqualTo(Bytes.FromHexString("10"))); + Assert.That(value, Is.EqualTo(Bytes.FromHexString("a0"))); + } + + // 9 records at interval 8 ⇒ two restart runs (records 0..7, then record 8). A target between the + // last key of run 0 and the first key of run 1 must scan ACROSS the restart boundary — guards the + // "scan to recordsEnd, not runEnd" rule. 
+ [Test] + public void Ceiling_in_gap_scans_across_restart_runs() + { + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[9]; + for (int i = 0; i < 8; i++) entries[i] = ([(byte)i], [(byte)i]); + entries[8] = (Bytes.FromHexString("10"), Bytes.FromHexString("ff")); // first key of restart run 1 + + byte[] block = BuildBlock(8, entries); + Assert.That(SeekCeiling(block, Bytes.FromHexString("0a"), out byte[] key, out byte[] value), Is.True); + Assert.That(key, Is.EqualTo(Bytes.FromHexString("10"))); + Assert.That(value, Is.EqualTo(Bytes.FromHexString("ff"))); + } + + [Test] + public void Ceiling_past_last_key_returns_false() + { + byte[] block = BuildBlock(8, + [ + (Bytes.FromHexString("10"), Bytes.FromHexString("a0")), + (Bytes.FromHexString("20"), Bytes.FromHexString("a1")), + ]); + Assert.That(SeekCeiling(block, Bytes.FromHexString("30"), out _, out _), Is.False); + } + + [Test] + public void Ceiling_on_empty_block_returns_false() + { + byte[] block = BuildBlock(8, []); + Assert.That(SeekCeiling(block, Bytes.FromHexString("00"), out _, out _), Is.False); + } + + // Clamping the restart binary search to a window that still contains the target's predecessor restart + // must return the byte-identical ceiling an unclamped search does — the optimization SortedTableReader + // uses to confine an index lookup to one column. 50 records at interval 8 give 7 restart runs; present + // keys are even so odd probes fall in in-run gaps, and the predecessor restart of a key at record r is + // r/8, so the single-restart window [r/8, r/8] (and [r/8, last] for a probe) suffices. 
+ [Test] + public void SeekCeiling_clamped_to_restart_window_matches_unclamped() + { + const int interval = 8; + const int count = 50; + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[2]; + BinaryPrimitives.WriteUInt16BigEndian(key, (ushort)(2 * i)); // even + entries[i] = (key, [(byte)i]); + } + byte[] block = BuildBlock(interval, entries); + + for (int i = 0; i < count; i++) + { + long restart = i / interval; + + // Present key: its own single-restart window must reproduce the unclamped result. + bool baseFound = SeekCeiling(block, entries[i].Key, out byte[] bk, out byte[] bv); + bool clampedFound = SeekCeilingClamped(block, entries[i].Key, restart, restart, out byte[] ck, out byte[] cv); + Assert.That(clampedFound, Is.EqualTo(baseFound)); + Assert.That(ck, Is.EqualTo(bk)); + Assert.That(cv, Is.EqualTo(bv)); + + // Absent odd probe: ceiling is the next key (possibly in a later restart run, reached by the + // unbounded forward scan); window upper bound is clamped into range. 
+ byte[] probe = new byte[2]; + BinaryPrimitives.WriteUInt16BigEndian(probe, (ushort)(2 * i + 1)); + bool pBase = SeekCeiling(block, probe, out byte[] pbk, out byte[] pbv); + bool pClamped = SeekCeilingClamped(block, probe, restart, count, out byte[] pck, out byte[] pcv); + Assert.That(pClamped, Is.EqualTo(pBase)); + Assert.That(pck, Is.EqualTo(pbk)); + Assert.That(pcv, Is.EqualTo(pbv)); + } + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/Sorted/SortedTableTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/Sorted/SortedTableTests.cs new file mode 100644 index 000000000000..682909639c8a --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/Sorted/SortedTableTests.cs @@ -0,0 +1,483 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.Buffers.Binary; +using System.Collections.Generic; +using System.Linq; +using Nethermind.Core.Extensions; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots.Sorted; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test.Sorted; + +[TestFixture] +public class SortedTableTests +{ + // Mixed key lengths, a prefix pair ("00" / "0000"), and an empty value. + private static (byte[] Key, byte[] Value)[] SampleEntries() => + [ + (Bytes.FromHexString("00"), Bytes.FromHexString("aa")), + (Bytes.FromHexString("0000"), []), + (Bytes.FromHexString("01ff"), Bytes.FromHexString("0102030405")), + (Bytes.FromHexString("7f"), Bytes.FromHexString("01")), + (Bytes.FromHexString("fe00112233"), Bytes.FromHexString("99")), + (Bytes.FromHexString("ff"), Bytes.FromHexString("deadbeef")), + ]; + + // The builder requires strictly ascending keys, so feed them sorted regardless of input order. + private static byte[] BuildTable((byte[] Key, byte[] Value)[] entries) + { + (byte[] Key, byte[] Value)[] sorted = [.. 
entries]; + Array.Sort(sorted, static (x, y) => x.Key.AsSpan().SequenceCompareTo(y.Key)); + + using PooledByteBufferWriter pooled = new(256); + SortedTableBuilder table = new(ref pooled.GetWriter()); + try + { + foreach ((byte[] Key, byte[] Value) e in sorted) + table.Add(e.Key, e.Value); + table.Build(); + } + finally + { + table.Dispose(); + } + return pooled.WrittenSpan.ToArray(); + } + + private static long DataBlockCount(byte[] bytes) + { + SpanByteReader reader = new(bytes); + Assert.That(SortedTable.TryReadFooter(in reader, new Bound(0, reader.Length), out SortedTable.Footer footer), Is.True); + return footer.NumDataBlocks; + } + + private static bool Seek(byte[] bytes, ReadOnlySpan key, out byte[] value) + { + SpanByteReader reader = new(bytes); + if (!SortedTableReader.TrySeek(in reader, new Bound(0, reader.Length), key, out Bound v)) + { + value = []; + return false; + } + value = new byte[v.Length]; + reader.TryRead(v.Offset, value); + return true; + } + + private static bool SeekInColumn(byte[] bytes, long loBlock, long hiBlock, ReadOnlySpan key, out byte[] value) + { + SpanByteReader reader = new(bytes); + Bound table = new(0, reader.Length); + Assert.That(SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer), Is.True); + if (!SortedTableReader.TrySeekInColumn(in reader, table, in footer, loBlock, hiBlock, key, out Bound v)) + { + value = []; + return false; + } + value = new byte[v.Length]; + reader.TryRead(v.Offset, value); + return true; + } + + // Resolve a column's inclusive data-block range the way PersistedSnapshotColumnBounds does: the + // column's first block, up to (inclusive) the next boundary's first block — the next boundary may + // share a block with this column's tail. 
+ private static (long Lo, long Hi) ColumnRange(byte[] bytes, byte tag, byte nextTag) + { + SpanByteReader reader = new(bytes); + Bound table = new(0, reader.Length); + Assert.That(SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer), Is.True); + long last = footer.NumDataBlocks - 1; + Span k = stackalloc byte[1]; + k[0] = tag; + long lo = SortedTableReader.TryFindStartBlock(in reader, table, in footer, k, out long lb) ? lb : footer.NumDataBlocks; + k[0] = nextTag; + long hi = SortedTableReader.TryFindStartBlock(in reader, table, in footer, k, out long hb) ? hb : footer.NumDataBlocks; + lo = Math.Clamp(lo, 0, last); + hi = Math.Clamp(hi, lo, last); + return (lo, hi); + } + + private static void AssertSameSeek(byte[] bytes, long lo, long hi, byte[] key) + { + bool baseFound = Seek(bytes, key, out byte[] baseVal); + bool colFound = SeekInColumn(bytes, lo, hi, key, out byte[] colVal); + Assert.That(colFound, Is.EqualTo(baseFound), $"presence mismatch for {key.ToHexString()}"); + Assert.That(colVal, Is.EqualTo(baseVal), $"value mismatch for {key.ToHexString()}"); + } + + private static List Enumerate(byte[] bytes) + { + SpanByteReader reader = new(bytes); + SortedTableEnumerator e = new(in reader, new Bound(0, reader.Length)); + List keys = []; + while (e.MoveNext(in reader)) keys.Add(e.CurrentKey.ToArray()); + return keys; + } + + [Test] + public void Round_trips_every_key_and_reports_misses() + { + (byte[] Key, byte[] Value)[] entries = SampleEntries(); + byte[] bytes = BuildTable(entries); + + foreach ((byte[] key, byte[] value) in entries) + { + Assert.That(Seek(bytes, key, out byte[] got), Is.True, $"key {key.ToHexString()} should be found"); + Assert.That(got, Is.EqualTo(value), $"value for {key.ToHexString()}"); + } + + // Misses: an absent key, and a key that is a prefix of a present one but not itself present. 
+ Assert.That(Seek(bytes, Bytes.FromHexString("02"), out _), Is.False); + Assert.That(Seek(bytes, Bytes.FromHexString("0001"), out _), Is.False); + Assert.That(Seek(bytes, Bytes.FromHexString("ffff"), out _), Is.False); + } + + [Test] + public void Add_rejects_non_ascending_and_duplicate_keys() + { + Assert.That(static () => AddPair(Bytes.FromHexString("02"), Bytes.FromHexString("01")), Throws.ArgumentException, "descending key"); + Assert.That(static () => AddPair(Bytes.FromHexString("02"), Bytes.FromHexString("02")), Throws.ArgumentException, "duplicate key"); + Assert.That(static () => AddPair(Bytes.FromHexString("01"), Bytes.FromHexString("02")), Throws.Nothing, "ascending key"); + + // Separate method so the ref-struct builder is never captured by the assertion delegate. + static void AddPair(byte[] first, byte[] second) + { + using PooledByteBufferWriter pooled = new(256); + SortedTableBuilder table = new(ref pooled.GetWriter()); + try + { + table.Add(first, Bytes.FromHexString("aa")); + table.Add(second, Bytes.FromHexString("bb")); + } + finally + { + table.Dispose(); + } + } + } + + [Test] + public void Enumerates_in_ascending_key_order() + { + (byte[] Key, byte[] Value)[] entries = SampleEntries(); + byte[] bytes = BuildTable(entries); + + List keys = Enumerate(bytes); + Assert.That(keys.Count, Is.EqualTo(entries.Length)); + for (int i = 1; i < keys.Count; i++) + Assert.That(keys[i - 1].AsSpan().SequenceCompareTo(keys[i]), Is.LessThan(0), "keys must be strictly ascending"); + } + + [Test] + public void Empty_table_seeks_and_enumerates_nothing() + { + byte[] bytes = BuildTable([]); + Assert.That(DataBlockCount(bytes), Is.EqualTo(0)); + Assert.That(Seek(bytes, Bytes.FromHexString("00"), out _), Is.False); + Assert.That(Enumerate(bytes), Is.Empty); + } + + [Test] + public void Single_record_round_trips() + { + (byte[] Key, byte[] Value)[] entries = [(Bytes.FromHexString("abcdef"), Bytes.FromHexString("1234"))]; + byte[] bytes = BuildTable(entries); + + 
Assert.That(DataBlockCount(bytes), Is.EqualTo(1)); + Assert.That(Seek(bytes, entries[0].Key, out byte[] got), Is.True); + Assert.That(got, Is.EqualTo(entries[0].Value)); + Assert.That(Seek(bytes, Bytes.FromHexString("abcdee"), out _), Is.False); // before + Assert.That(Seek(bytes, Bytes.FromHexString("abcdff"), out _), Is.False); // after + Assert.That(Enumerate(bytes).Count, Is.EqualTo(1)); + } + + // A single 4 KB block, exercising restart-run boundaries around RestartInterval (= 8): the + // builder resets front-coding every restart, the reader binary-searches restarts then scans one run. + [TestCase(7)] + [TestCase(8)] + [TestCase(9)] + [TestCase(16)] + [TestCase(24)] + [TestCase(25)] + [TestCase(48)] + public void Restart_boundaries_within_one_block(int count) + { + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(key, i); + entries[i] = (key, [(byte)i, (byte)(i + 1)]); + } + byte[] bytes = BuildTable(entries); + + Assert.That(DataBlockCount(bytes), Is.EqualTo(1), "small values keep all records in one block"); + for (int i = 0; i < count; i++) + { + Assert.That(Seek(bytes, entries[i].Key, out byte[] got), Is.True); + Assert.That(got, Is.EqualTo(entries[i].Value)); + } + byte[] missing = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(missing, count); + Assert.That(Seek(bytes, missing, out _), Is.False); + } + + // Exercise the last-block fill across single-block sizes 1..17. 
+ [TestCase(1)] + [TestCase(7)] + [TestCase(8)] + [TestCase(9)] + [TestCase(16)] + [TestCase(17)] + public void Round_trips_across_record_counts(int count) + { + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(key, i); + entries[i] = (key, [(byte)i]); + } + byte[] bytes = BuildTable(entries); + + for (int i = 0; i < count; i++) + { + Assert.That(Seek(bytes, entries[i].Key, out byte[] got), Is.True); + Assert.That(got, Is.EqualTo(entries[i].Value)); + } + byte[] missing = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(missing, count); + Assert.That(Seek(bytes, missing, out _), Is.False); + } + + // Large values force many 4 KB blocks. Present keys are odd, so every even probe lands in a gap — + // including gaps that straddle a block boundary (the separator lower-bound + in-block re-validation), + // plus the before-first and after-last sentinels. + [TestCase(50)] + [TestCase(800)] + [TestCase(4000)] + public void Round_trips_multiblock_with_gaps(int count) + { + byte[] value = new byte[200]; + for (int i = 0; i < value.Length; i++) value[i] = (byte)i; + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(key, 2 * i + 1); // odd + entries[i] = (key, value); + } + byte[] bytes = BuildTable(entries); + + Assert.That(DataBlockCount(bytes), Is.GreaterThan(1), "200-byte values span multiple 4 KB blocks"); + + for (int i = 0; i < count; i++) + { + Assert.That(Seek(bytes, entries[i].Key, out byte[] got), Is.True, $"present key #{i}"); + Assert.That(got, Is.EqualTo(value)); + + byte[] gap = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(gap, 2 * i); // even: before-first (i==0) or between two present keys + Assert.That(Seek(bytes, gap, out _), Is.False, $"gap key {2 * i}"); + } + byte[] after = new byte[4]; + 
BinaryPrimitives.WriteInt32BigEndian(after, 2 * count); // > last present key + Assert.That(Seek(bytes, after, out _), Is.False); + + List keys = Enumerate(bytes); + Assert.That(keys.Count, Is.EqualTo(count)); + for (int i = 1; i < keys.Count; i++) + Assert.That(keys[i - 1].AsSpan().SequenceCompareTo(keys[i]), Is.LessThan(0), "ascending across every block boundary"); + } + + // 32-byte keys sharing a 30-byte prefix, differing only in the last two bytes — exercises long + // front-coded cp within restart runs and the cp == 0 reset at each restart and block boundary. + [TestCase(20)] + [TestCase(4000)] + public void Long_shared_prefix_round_trips(int count) + { + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[32]; + key.AsSpan(0, 30).Fill(0xAB); + BinaryPrimitives.WriteUInt16BigEndian(key.AsSpan(30), (ushort)i); + entries[i] = (key, [(byte)i, (byte)(i + 1)]); + } + byte[] bytes = BuildTable(entries); + + for (int i = 0; i < count; i++) + { + Assert.That(Seek(bytes, entries[i].Key, out byte[] got), Is.True); + Assert.That(got, Is.EqualTo(entries[i].Value)); + } + + // Enumeration reconstructs the full 32-byte keys in ascending order. + List keys = Enumerate(bytes); + Assert.That(keys.Count, Is.EqualTo(count)); + for (int i = 0; i < count; i++) + { + Assert.That(keys[i].Length, Is.EqualTo(32)); + Assert.That(BinaryPrimitives.ReadUInt16BigEndian(keys[i].AsSpan(30)), Is.EqualTo((ushort)i)); + } + + byte[] missing = new byte[32]; + missing.AsSpan(0, 30).Fill(0xAB); + BinaryPrimitives.WriteUInt16BigEndian(missing.AsSpan(30), (ushort)count); + Assert.That(Seek(bytes, missing, out _), Is.False); + } + + // Fuzz arbitrary block fills, restart placements, separator computation and front-coding across + // boundaries with random unique keys (1..55 B) and values (0..254 B). 
+ [TestCase(1)] + [TestCase(7)] + [TestCase(42)] + public void Fuzz_round_trips_random_tables(int seed) + { + Random rng = new(seed); + for (int iter = 0; iter < 25; iter++) + { + int count = rng.Next(1, 1500); + Dictionary map = new(count); + while (map.Count < count) + { + byte[] key = new byte[rng.Next(1, 56)]; + rng.NextBytes(key); + byte[] value = new byte[rng.Next(0, 255)]; + rng.NextBytes(value); + map[key.ToHexString()] = value; + } + + (byte[] Key, byte[] Value)[] entries = [.. map.Select(kv => (Bytes.FromHexString(kv.Key), kv.Value))]; + byte[] bytes = BuildTable(entries); + + foreach ((byte[] key, byte[] value) in entries) + { + Assert.That(Seek(bytes, key, out byte[] got), Is.True); + Assert.That(got, Is.EqualTo(value)); + } + + // Random probes; most are absent. Compare against the source map for the verdict. + for (int p = 0; p < 50; p++) + { + byte[] probe = new byte[rng.Next(1, 56)]; + rng.NextBytes(probe); + bool present = map.TryGetValue(probe.ToHexString(), out byte[]? expected); + Assert.That(Seek(bytes, probe, out byte[] got), Is.EqualTo(present)); + if (present) Assert.That(got, Is.EqualTo(expected)); + } + + List keys = Enumerate(bytes); + Assert.That(keys.Count, Is.EqualTo(entries.Length)); + for (int i = 1; i < keys.Count; i++) + Assert.That(keys[i - 1].AsSpan().SequenceCompareTo(keys[i]), Is.LessThan(0)); + } + } + + // Every data block but the last is zero-padded to BlockSize, so data block i starts at i*BlockSize. + // The (unaligned) index block is located by the footer's IndexOffset, right after the last block. 
+ [Test] + public void Data_blocks_are_4k_aligned_and_index_located_by_offset() + { + const int count = 300; + byte[] value = new byte[200]; + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(key, i); + entries[i] = (key, value); + } + byte[] bytes = BuildTable(entries); + + SpanByteReader reader = new(bytes); + Bound table = new(0, reader.Length); + Assert.That(SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer), Is.True); + long m = footer.NumDataBlocks; + Assert.That(m, Is.GreaterThan(1)); + + for (long i = 0; i < m; i++) + Assert.That(BlockReader.ReadHeader(in reader, i * SortedTable.BlockSize, out int w, out _, out _, out _) && (w is Block.Width2 or Block.Width4), + Is.True, $"data block {i} at {i * SortedTable.BlockSize}"); + + // The index block is located directly by the footer's IndexOffset (it is not block-aligned and + // begins right after the last, unpadded, data block). + Assert.That(footer.IndexOffset, Is.GreaterThanOrEqualTo((m - 1) * SortedTable.BlockSize)); + Assert.That(BlockReader.ReadHeader(in reader, SortedTable.IndexBlockStart(table, footer), out _, out _, out _, out _), Is.True, "index block at IndexOffset"); + } + + // u32 block number * 4 KiB reaches ~16 TiB; the helper must widen before multiplying. + [Test] + public void Block_number_addressing_does_not_overflow() => + Assert.That(SortedTable.DataBlockStart(new Bound(0, 0), uint.MaxValue), Is.EqualTo((long)uint.MaxValue * SortedTable.BlockSize)); + + // A clamped per-column seek (the path PersistedSnapshot uses) must return byte-identical results to an + // unclamped whole-table TrySeek — for present keys, in-column gap keys, and keys past a column's end + // (which resolve into the next column's first block via the inclusive upper edge). 
valueSize 8 keeps + // columns small enough to share blocks (boundary straddle, collapsed restart window); 200 spreads each + // column across several 4 KiB blocks so the index search is genuinely narrowed. + [TestCase(8)] + [TestCase(200)] + public void TrySeekInColumn_matches_TrySeek(int valueSize) + { + byte[] tags = [0x10, 0x20, 0x30]; + const int perColumn = 300; + byte[] value = new byte[valueSize]; + for (int i = 0; i < value.Length; i++) value[i] = (byte)i; + + List<(byte[] Key, byte[] Value)> list = []; + foreach (byte tag in tags) + for (int i = 0; i < perColumn; i++) + { + int counter = 2 * i; // even ⇒ odd probes land in in-column gaps + list.Add(([tag, (byte)(counter >> 16), (byte)(counter >> 8), (byte)counter], value)); + } + byte[] bytes = BuildTable([.. list]); + Assert.That(DataBlockCount(bytes), Is.GreaterThan(1), "table must span several blocks for clamping to matter"); + + for (int t = 0; t < tags.Length; t++) + { + // Last column's next boundary is a tag above every key, so its run extends to the last block. + byte nextTag = t + 1 < tags.Length ? tags[t + 1] : (byte)0xFF; + (long lo, long hi) = ColumnRange(bytes, tags[t], nextTag); + + for (int i = 0; i <= perColumn; i++) // perColumn includes the just-past-the-end boundary probe + { + int present = 2 * i; + AssertSameSeek(bytes, lo, hi, [tags[t], (byte)(present >> 16), (byte)(present >> 8), (byte)present]); + int absent = 2 * i + 1; + AssertSameSeek(bytes, lo, hi, [tags[t], (byte)(absent >> 16), (byte)(absent >> 8), (byte)absent]); + } + } + } + + [Test] + public void Large_table_round_trips_across_many_blocks() + { + // Enough entries to span many data blocks and a sizeable index block. 
+ const int count = 5000; + (byte[] Key, byte[] Value)[] entries = new (byte[], byte[])[count]; + for (int i = 0; i < count; i++) + { + byte[] key = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(key, i); + entries[i] = (key, [(byte)(i & 0xFF), (byte)((i >> 8) & 0xFF)]); + } + byte[] bytes = BuildTable(entries); + Assert.That(DataBlockCount(bytes), Is.GreaterThan(1)); + + for (int i = 0; i < count; i++) + { + Assert.That(Seek(bytes, entries[i].Key, out byte[] got), Is.True); + Assert.That(got, Is.EqualTo(entries[i].Value)); + } + + byte[] missing = new byte[4]; + BinaryPrimitives.WriteInt32BigEndian(missing, count + 1); + Assert.That(Seek(bytes, missing, out _), Is.False); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/StorageLayerTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/StorageLayerTests.cs new file mode 100644 index 000000000000..e528ed785610 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/StorageLayerTests.cs @@ -0,0 +1,377 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.IO; +using System.Linq; +using Nethermind.Core.Crypto; +using Nethermind.Db; +using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using NUnit.Framework; + +namespace Nethermind.State.Flat.Test; + +[TestFixture] +public class StorageLayerTests +{ + private string _testDir = null!; + + // Look up a catalog entry by (To, depth) over the loaded list — the catalog has no Find method + // and no in-memory index; Load() reads the current state from the DB each call. + private static CatalogEntry? 
FindEntry(SnapshotCatalog catalog, StateId to, long depth) =>
+        catalog.Load().FirstOrDefault(e => e.To.Equals(to) && e.To.BlockNumber - e.From.BlockNumber == depth);
+
+    // Each test gets its own uniquely named scratch directory under the system temp path.
+    [SetUp]
+    public void SetUp()
+    {
+        _testDir = Path.Combine(Path.GetTempPath(), $"nethermind_test_{Guid.NewGuid():N}");
+        Directory.CreateDirectory(_testDir);
+    }
+
+    // Removes the scratch directory and everything the test wrote into it.
+    [TearDown]
+    public void TearDown()
+    {
+        if (Directory.Exists(_testDir))
+        {
+            Directory.Delete(_testDir, recursive: true);
+        }
+    }
+
+    // Bytes written to the arena's backing file through a separate FileStream must be readable
+    // through the arena's base pointer.
+    [Test]
+    public unsafe void ArenaFile_WriteViaStreamAndRead_RoundTrips()
+    {
+        string path = Path.Combine(_testDir, "arena.bin");
+        byte[] data1 = [1, 2, 3, 4, 5];
+        byte[] data2 = new byte[1000];
+        Random.Shared.NextBytes(data2);
+
+        using ArenaFile arena = new(0, path, 1024 * 1024);
+
+        using (FileStream fs = new(path, FileMode.OpenOrCreate, FileAccess.Write, FileShare.ReadWrite))
+        {
+            fs.Write(data1);
+            fs.Write(data2);
+            fs.Flush();
+        }
+
+        // Read back via the raw mmap pointer — the same access path ArenaByteReader uses.
+        Assert.That(new ReadOnlySpan<byte>(arena.BasePtr, data1.Length).ToArray(), Is.EqualTo(data1));
+        Assert.That(new ReadOnlySpan<byte>(arena.BasePtr + data1.Length, data2.Length).ToArray(), Is.EqualTo(data2));
+        Assert.That(arena.MappedSize, Is.EqualTo(1024 * 1024));
+    }
+
+    // Entries added through one SnapshotCatalog must be visible, field for field, through a second
+    // catalog instance opened over the same database.
+    [Test]
+    public void SnapshotCatalog_SaveLoad_RoundTrips()
+    {
+        MemDb catalogDb = new();
+        // Same To across three entries with distinct depths (1 / 2 / 4) — mirrors the
+        // runtime case where a base + sub-CompactSize compacted + CompactSized snapshot
+        // all end at the same block. Pre-v7 catalog would collapse these to one entry on
+        // disk; v7 keys by (To, depth) and round-trips all three.
+        StateId baseFrom = new(99, Keccak.Compute("block99")); // depth=1 source
+        StateId compactedFrom = new(98, Keccak.Compute("block98")); // depth=2 source
+        StateId compactSizedFrom = new(96, Keccak.Compute("block96")); // depth=4 source
+        StateId sharedTo = new(100, Keccak.Compute("block100"));
+        StateId s2 = new(200, Keccak.Compute("block200"));
+
+        SnapshotCatalog catalog = new(catalogDb);
+        catalog.Add(new(baseFrom, sharedTo, new(0, 0, 1024), SnapshotTier.PersistedBase));
+        catalog.Add(new(compactedFrom, sharedTo, new(0, 1024, 2048), SnapshotTier.PersistedSmallCompacted));
+        catalog.Add(new(compactSizedFrom, sharedTo, new(0, 3072, 4096), SnapshotTier.PersistedCompactSized));
+        catalog.Add(new(sharedTo, s2, new(0, 7168, 2048), SnapshotTier.PersistedCompactSized));
+
+        SnapshotCatalog loaded = new(catalogDb);
+
+        Assert.That(loaded.Load().Count, Is.EqualTo(4));
+
+        CatalogEntry? loadedBase = FindEntry(loaded, sharedTo, depth: 1);
+        CatalogEntry? loadedCompacted = FindEntry(loaded, sharedTo, depth: 2);
+        CatalogEntry? loadedCompactSized = FindEntry(loaded, sharedTo, depth: 4);
+        Assert.That(loadedBase, Is.Not.Null);
+        Assert.That(loadedBase!.From, Is.EqualTo(baseFrom));
+        Assert.That(loadedBase.Location, Is.EqualTo(new SnapshotLocation(0, 0, 1024)));
+        Assert.That(loadedBase.Tier, Is.EqualTo(SnapshotTier.PersistedBase));
+        Assert.That(loadedCompacted, Is.Not.Null);
+        Assert.That(loadedCompacted!.From, Is.EqualTo(compactedFrom));
+        Assert.That(loadedCompacted.Location, Is.EqualTo(new SnapshotLocation(0, 1024, 2048)));
+        Assert.That(loadedCompacted.Tier, Is.EqualTo(SnapshotTier.PersistedSmallCompacted));
+        Assert.That(loadedCompactSized, Is.Not.Null);
+        Assert.That(loadedCompactSized!.From, Is.EqualTo(compactSizedFrom));
+        Assert.That(loadedCompactSized.Location, Is.EqualTo(new SnapshotLocation(0, 3072, 4096)));
+        Assert.That(loadedCompactSized.Tier, Is.EqualTo(SnapshotTier.PersistedCompactSized));
+
+        // The tail entry runs from block 100 to block 200, so its depth is 100.
+        CatalogEntry? loadedTail = FindEntry(loaded, s2, depth: 100);
+        Assert.That(loadedTail, Is.Not.Null);
+        Assert.That(loadedTail!.From, Is.EqualTo(sharedTo));
+        Assert.That(loadedTail.Location, Is.EqualTo(new SnapshotLocation(0, 7168, 2048)));
+        Assert.That(loadedTail.Tier, Is.EqualTo(SnapshotTier.PersistedCompactSized));
+    }
+
+    [Test]
+    public void SnapshotCatalog_Remove_And_Find()
+    {
+        StateId s0 = new(0, Keccak.EmptyTreeHash);
+        StateId s_compactedFrom = new(0, Keccak.Compute("compactedFrom"));
+        StateId s1 = new(1, Keccak.Compute("1"));
+        StateId s2 = new(2, Keccak.Compute("2"));
+        StateId missing = new(999, Keccak.Compute("missing"));
+
+        SnapshotCatalog catalog = new(new MemDb());
+        catalog.Add(new(s0, s1, new(0, 0, 100), SnapshotTier.PersistedBase));
+        catalog.Add(new(s1, s2, new(0, 100, 200), SnapshotTier.PersistedBase));
+        // Same To (s2), different depth (s_compactedFrom→s2 has depth=2 vs s1→s2 depth=1).
+        catalog.Add(new(s_compactedFrom, s2, new(0, 200, 100), SnapshotTier.PersistedSmallCompacted));
+
+        Assert.That(FindEntry(catalog, s1, depth: 1), Is.Not.Null);
+        Assert.That(catalog.Remove(s1, depth: 1), Is.True);
+        Assert.That(FindEntry(catalog, s1, depth: 1), Is.Null);
+        Assert.That(catalog.Load().Count, Is.EqualTo(2));
+        Assert.That(catalog.Remove(missing, depth: 1), Is.False);
+
+        // Removing one (To, depth) leaves the sibling at the same To intact.
+        Assert.That(FindEntry(catalog, s2, depth: 1), Is.Not.Null);
+        Assert.That(FindEntry(catalog, s2, depth: 2), Is.Not.Null);
+        Assert.That(catalog.Remove(s2, depth: 1), Is.True);
+        Assert.That(FindEntry(catalog, s2, depth: 1), Is.Null);
+        Assert.That(FindEntry(catalog, s2, depth: 2), Is.Not.Null);
+    }
+
+    // A catalog over a database with nothing in it loads as an empty collection.
+    [Test]
+    public void SnapshotCatalog_Load_EmptyOrMissing_ReturnsEmpty()
+    {
+        SnapshotCatalog catalog = new(new MemDb());
+
+        Assert.That(catalog.Load(), Is.Empty);
+    }
+
+    // Bytes pushed through an ArenaWriter must read back unchanged from the location it reports.
+    [Test]
+    public void ArenaManager_CreateWriterAndComplete_WritesToArena()
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        using ArenaManager manager = new(arenaDir, new FlatDbConfig
+        {
+            ArenaFileSizeBytes = 4096,
+        }, LimboLogs.Instance);
+        manager.Initialize([]);
+
+        byte[] data = [1, 2, 3, 4, 5, 6, 7, 8];
+
+        SnapshotLocation location;
+        using (ArenaWriter arenaWriter = manager.CreateWriter(data.Length))
+        {
+            data.CopyTo(arenaWriter.GetWriter().GetSpan(data.Length));
+            arenaWriter.GetWriter().Advance(data.Length);
+            (location, _) = arenaWriter.Complete();
+        }
+
+        using (WholeReadSession session = manager.Open(location).BeginWholeReadSession())
+        {
+            Assert.That(TestFixtureHelpers.ReadAll(session), Is.EqualTo(data));
+        }
+
+        Assert.That(location.Size, Is.EqualTo(data.Length));
+    }
+
+    // Both pools (non-small and small) share the same reserve / cancel / re-add lifecycle, so the
+    // cancelled-write reuse must hold for each independently.
+    [TestCase(false)]
+    [TestCase(true)]
+    public void ArenaManager_CancelWrite_AllowsReuse(bool small)
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        // 64 KiB so two page-aligned reservations fit in one shared arena file.
+        using ArenaManager manager = new(arenaDir, new FlatDbConfig
+        {
+            ArenaFileSizeBytes = 64 * 1024,
+        }, LimboLogs.Instance);
+        manager.Initialize([]);
+
+        // Completed baseline write; Write() is this fixture's create / copy / advance / complete helper.
+        byte[] baseline = [0xAA];
+        SnapshotLocation baselineLoc = Write(manager, baseline, small);
+
+        using (ArenaWriter arenaWriter = manager.CreateWriter(0, small))
+        {
+            // Don't call Complete — Dispose will cancel the write and return the file to its pool.
+        }
+
+        byte[] data = new byte[50];
+        SnapshotLocation loc = Write(manager, data, small);
+
+        // The reused write starts at the page-aligned frontier after the baseline reservation —
+        // i.e. it landed in the same file, proving the cancelled write returned to the right pool.
+        Assert.That(loc.ArenaId, Is.EqualTo(baselineLoc.ArenaId));
+        Assert.That(loc.Offset, Is.EqualTo(PageLayout.RoundUpToOsPage(baselineLoc.Offset + baselineLoc.Size)));
+    }
+
+    // The reported Size is the exact payload length, while the following reservation begins at the
+    // OS-page-rounded end of the previous one.
+    [Test]
+    public void ArenaManager_CreateWriter_NextReservationIsPageAligned()
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        // 64 KiB so two page-aligned reservations fit in one shared arena file.
+        using ArenaManager manager = new(arenaDir, new FlatDbConfig
+        {
+            ArenaFileSizeBytes = 64 * 1024,
+        }, LimboLogs.Instance);
+        manager.Initialize([]);
+
+        byte[] data = [1, 2, 3];
+        SnapshotLocation location;
+        using (ArenaWriter arenaWriter = manager.CreateWriter(data.Length))
+        {
+            data.CopyTo(arenaWriter.GetWriter().GetSpan(data.Length));
+            arenaWriter.GetWriter().Advance(data.Length);
+            (location, _) = arenaWriter.Complete();
+        }
+
+        // Size stays the exact byte count; only the frontier is page-padded.
+        Assert.That(location.Size, Is.EqualTo(3));
+
+        // Next reservation starts at the page-aligned frontier, not right after the data.
+        byte[] next = [4, 5];
+        SnapshotLocation nextLoc;
+        using (ArenaWriter w = manager.CreateWriter(next.Length))
+        {
+            next.CopyTo(w.GetWriter().GetSpan(next.Length));
+            w.GetWriter().Advance(next.Length);
+            (nextLoc, _) = w.Complete();
+        }
+
+        Assert.That(nextLoc.Offset, Is.EqualTo(PageLayout.RoundUpToOsPage(location.Offset + location.Size)));
+    }
+
+    // A writer created with an estimate above the dedicated threshold produces a dedicated_*.bin file
+    // whose on-disk length equals the bytes actually written once Complete has run.
+    [Test]
+    public void ArenaManager_DedicatedArena_ShrinksToActualSizeOnComplete()
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        // Lower the dedicated threshold so the test doesn't need to allocate 512 MiB.
+        using ArenaManager manager = new(arenaDir, new FlatDbConfig
+        {
+            ArenaFileSizeBytes = 4096,
+            PersistedSnapshotDedicatedArenaThresholdBytes = 64 * 1024,
+        }, LimboLogs.Instance);
+        manager.Initialize([]);
+
+        const long estimate = 256 * 1024;
+        byte[] data = [1, 2, 3, 4, 5, 6, 7, 8];
+
+        SnapshotLocation location;
+        string dedicatedFile;
+        using (ArenaWriter writer = manager.CreateWriter(estimate))
+        {
+            data.CopyTo(writer.GetWriter().GetSpan(data.Length));
+            writer.GetWriter().Advance(data.Length);
+            (location, _) = writer.Complete();
+            dedicatedFile = Directory.GetFiles(arenaDir, "dedicated_*.bin")[0];
+        }
+
+        Assert.That(new FileInfo(dedicatedFile).Length, Is.EqualTo(data.Length));
+        using WholeReadSession session = manager.Open(location).BeginWholeReadSession();
+        Assert.That(TestFixtureHelpers.ReadAll(session), Is.EqualTo(data));
+    }
+
+    // Two writers that are open at the same time must report different arena ids.
+    [Test]
+    public void ArenaManager_ConcurrentWriters_UseDifferentArenas()
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        using ArenaManager manager = new(arenaDir, new FlatDbConfig
+        {
+            ArenaFileSizeBytes = 200,
+        }, LimboLogs.Instance);
+        manager.Initialize([]);
+
+        byte[] data = [1, 2, 3];
+
+        using ArenaWriter w1 = manager.CreateWriter(data.Length);
+        // w1 holds the first arena; w2 must be assigned a different one while w1 is open.
+        using ArenaWriter w2 = manager.CreateWriter(data.Length);
+        data.CopyTo(w1.GetWriter().GetSpan(data.Length));
+        w1.GetWriter().Advance(data.Length);
+        data.CopyTo(w2.GetWriter().GetSpan(data.Length));
+        w2.GetWriter().Advance(data.Length);
+
+        (SnapshotLocation loc1, _) = w1.Complete();
+        (SnapshotLocation loc2, _) = w2.Complete();
+
+        Assert.That(loc1.ArenaId, Is.Not.EqualTo(loc2.ArenaId));
+    }
+
+    // Small and non-small writes go to different files; repeated small writes share one file.
+    [Test]
+    public void ArenaManager_SmallAndNonSmallWrites_UseSeparateFiles()
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        // Ample headroom: without pool separation all three writes would pack into one file.
+        using ArenaManager manager = new(arenaDir, new FlatDbConfig
+        {
+            ArenaFileSizeBytes = 64 * 1024,
+        }, LimboLogs.Instance);
+        manager.Initialize([]);
+
+        byte[] data = [1, 2, 3];
+        SnapshotLocation large = Write(manager, data, small: false);
+        SnapshotLocation small = Write(manager, data, small: true);
+        SnapshotLocation small2 = Write(manager, data, small: true);
+
+        Assert.That(small.ArenaId, Is.Not.EqualTo(large.ArenaId), "small and non-small writes must not share a file");
+        Assert.That(small2.ArenaId, Is.EqualTo(small.ArenaId), "consecutive small writes pack into the small pool's file");
+        // The "arena_*" glob is prefix-anchored, so it must not catch the "small_arena_*" file.
+        Assert.That(Directory.GetFiles(arenaDir, "small_arena_*.bin"), Has.Length.EqualTo(1));
+        Assert.That(Directory.GetFiles(arenaDir, "arena_*.bin"), Has.Length.EqualTo(1));
+    }
+
+    [Test]
+    public void ArenaManager_SmallArenaFile_SurvivesCatalogRoundTrip()
+    {
+        string arenaDir = Path.Combine(_testDir, "arenas");
+        FlatDbConfig config = new()
+        {
+            ArenaFileSizeBytes = 64 * 1024,
+        };
+        byte[] data = [9, 8, 7, 6, 5];
+        StateId from = new(0, Keccak.Compute("from"));
+        StateId to = new(1, Keccak.Compute("to"));
+
+        SnapshotLocation location;
+        using (ArenaManager first = new(arenaDir, config, LimboLogs.Instance))
+        {
+            first.Initialize([]);
+            using ArenaWriter writer = first.CreateWriter(data.Length, small: true);
+            data.CopyTo(writer.GetWriter().GetSpan(data.Length));
+            writer.GetWriter().Advance(data.Length);
+            (location, ArenaReservation reservation) = writer.Complete();
+            // Keep the small_arena_ file on disk past Dispose so the next session can reload it.
+            reservation.PersistOnShutdown();
+            reservation.Dispose();
+        }
+
+        // Fresh manager over the same dir, primed with the catalog entry referencing the small file.
+        // Open succeeds only if Initialize recognized the small_arena_ prefix and loaded the file;
+        // otherwise the entry is dropped and the arena left unregistered.
+ CatalogEntry entry = new(from, to, location, SnapshotTier.PersistedBase); + using ArenaManager second = new(arenaDir, config, LimboLogs.Instance); + second.Initialize([entry]); + + using WholeReadSession session = second.Open(location).BeginWholeReadSession(); + Assert.That(TestFixtureHelpers.ReadAll(session), Is.EqualTo(data)); + } + + private static SnapshotLocation Write(ArenaManager manager, byte[] data, bool small) + { + using ArenaWriter writer = manager.CreateWriter(data.Length, small); + data.CopyTo(writer.GetWriter().GetSpan(data.Length)); + writer.GetWriter().Advance(data.Length); + (SnapshotLocation location, _) = writer.Complete(); + return location; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/Sync/Snap/FlatSnapServerTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/Sync/Snap/FlatSnapServerTests.cs index 89dec32c5c13..92f2c37080fe 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/Sync/Snap/FlatSnapServerTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/Sync/Snap/FlatSnapServerTests.cs @@ -12,6 +12,7 @@ using Nethermind.Logging; using Nethermind.Serialization.Rlp; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.State.Flat.Sync.Snap; using Nethermind.State.Snap; using Nethermind.Trie; @@ -48,7 +49,7 @@ public void SetUp() _flatDbManager = Substitute.For(); _flatDbManager.GatherReadOnlySnapshotBundle(_stateId) - .Returns(_ => new ReadOnlySnapshotBundle(new SnapshotPooledList(0), _persistence.CreateReader(), recordDetailedMetrics: false)); + .Returns(_ => new ReadOnlySnapshotBundle(new SnapshotPooledList(0), _persistence.CreateReader(), recordDetailedMetrics: false, PersistedSnapshotStack.Empty())); _stateRootIndex = Substitute.For(); _stateRootIndex.TryGetStateId(Arg.Any(), out Arg.Any()) @@ -95,7 +96,7 @@ public void GetTrieNodes_RespectsHardResponseByteLimitInStorageLoop() _stateId = new StateId(0, _rootHash.ValueHash256); 
_flatDbManager.GatherReadOnlySnapshotBundle(_stateId) - .Returns(_ => new ReadOnlySnapshotBundle(new SnapshotPooledList(0), _persistence.CreateReader(), recordDetailedMetrics: false)); + .Returns(_ => new ReadOnlySnapshotBundle(new SnapshotPooledList(0), _persistence.CreateReader(), recordDetailedMetrics: false, PersistedSnapshotStack.Empty())); WriteState(stateRootRlp, addressHash, storageRootRlp); diff --git a/src/Nethermind/Nethermind.State.Flat.Test/TestFixtureHelpers.cs b/src/Nethermind/Nethermind.State.Flat.Test/TestFixtureHelpers.cs new file mode 100644 index 000000000000..2cea2d581f4d --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat.Test/TestFixtureHelpers.cs @@ -0,0 +1,134 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System; +using System.Buffers.Binary; +using System.Collections.Generic; +using Nethermind.Core; +using Nethermind.Db; +using Nethermind.Int256; +using Nethermind.Logging; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Sorted; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.State.Flat.Persistence.BloomFilter; + +namespace Nethermind.State.Flat.Test; + +internal static class TestFixtureHelpers +{ + /// + /// Creates a real over configured for tests: + /// the arena file size is floored to one OS page so tiny test sizes don't trip the mmap minimum. + /// + public static ArenaManager CreateArenaManager(string dir, int arenaSize = 64 * 1024) => + new(dir, new FlatDbConfig + { + ArenaFileSizeBytes = Math.Max(arenaSize, Environment.SystemPageSize), + }, LimboLogs.Instance); + + /// + /// Materialise an entire reservation's bytes through a fresh reader. Test convenience for + /// asserting on small whole-reservation payloads (throws if the reservation exceeds int range). 
+    ///
+    public static byte[] ReadAll(WholeReadSession session)
+    {
+        WholeReadSessionReader reader = session.CreateReader();
+        byte[] buf = new byte[checked((int)reader.Length)];
+        reader.TryRead(0, buf);
+        return buf;
+    }
+
+    /// <summary>
+    /// Read the ref_ids list from the metadata inside <paramref name="reservation"/>
+    /// and acquire a lease per id on <paramref name="blobs"/>. Mirrors what
+    /// SnapshotRepository does at load time — the resulting
+    /// <see cref="PersistedSnapshot"/>'s CleanUp drops one lease per id, keeping
+    /// refcounts balanced. No-op when there are no ref_ids (raw test bytes that aren't
+    /// a real sorted table).
+    /// </summary>
+    /// <param name="reservation">Reservation whose bytes are scanned for ref-id records.</param>
+    /// <param name="blobs">Blob arena manager on which one lease per referenced id is taken.</param>
+    /// <exception cref="InvalidOperationException">
+    /// <paramref name="blobs"/> cannot lease one of the referenced ids.
+    /// </exception>
+    public static void LeaseBlobIds(ArenaReservation reservation, BlobArenaManager blobs)
+    {
+        using WholeReadSession session = reservation.BeginWholeReadSession();
+        WholeReadSessionReader reader = session.CreateReader();
+        ushort[]? ids = ReadRefIdsFromMetadata(in reader);
+        if (ids is null) return;
+        foreach (ushort id in ids)
+        {
+            if (!blobs.TryLeaseFile(id, out _))
+                throw new InvalidOperationException(
+                    $"Test fixture's BlobArenaManager has no slot for id {id}; did Build() use a different manager?");
+        }
+    }
+
+    /// <summary>
+    /// Read the snapshot's referenced blob-arena ids (the ref-id records in column
+    /// <see cref="PersistedSnapshotKey.RefIdColumn"/>) as a <c>ushort[]</c>, or <c>null</c> when
+    /// there are none (e.g. raw test bytes that aren't a real table). Test-only convenience for
+    /// asserting the referenced id set; production walks them via PersistedSnapshot's
+    /// internal ref-ids enumerator.
+    /// </summary>
+    /// <param name="reader">Reader positioned over the whole table; it is scanned from offset 0.</param>
+    /// <returns>The ref ids in enumeration order, or <c>null</c> when the scan finds none.</returns>
+    // NOTE(review): the generic parameter list for TReader / TPin (and the type arguments on
+    // IByteReader and SortedTableEnumerator) is not visible on this declaration as shown —
+    // confirm against the upstream file before relying on this signature.
+    public static ushort[]? ReadRefIdsFromMetadata(scoped in TReader reader)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        List<ushort> ids = [];
+        SortedTableEnumerator e = new(in reader, new Bound(0, reader.Length));
+        while (e.MoveNext(in reader))
+        {
+            // Stop at the first key that is empty or does not belong to the ref-id column.
+            ReadOnlySpan<byte> key = e.CurrentKey;
+            if (key.Length == 0 || key[0] != PersistedSnapshotKey.RefIdColumn) break;
+            ids.Add(PersistedSnapshotKey.ReadRefId(key));
+        }
+        return ids.Count == 0 ? null : ids.ToArray();
+    }
+
+    /// <summary>
+    /// Write <paramref name="data"/> into a fresh reservation on <paramref name="arena"/>,
+    /// lease the blob ids referenced by its metadata (skipped when
+    /// <paramref name="leaseBlobIds"/> is false) and wrap the result in a
+    /// <see cref="PersistedSnapshot"/> over <paramref name="blobs"/>.
+    /// </summary>
+    /// <param name="arena">Arena manager that supplies the writer and the reservation.</param>
+    /// <param name="blobs">Blob arena manager handed to the snapshot and used for leasing.</param>
+    /// <param name="from">State the snapshot starts from.</param>
+    /// <param name="to">State the snapshot ends at.</param>
+    /// <param name="data">Raw bytes copied into the reservation.</param>
+    /// <param name="leaseBlobIds">When true, <see cref="LeaseBlobIds"/> runs on the new reservation.</param>
+    /// <returns>
+    /// A <see cref="SnapshotTier.PersistedBase"/> snapshot built with
+    /// <c>RefCountedBloomFilter.AlwaysTrue()</c>.
+    /// </returns>
+    public static PersistedSnapshot CreatePersistedSnapshot(
+        IArenaManager arena, BlobArenaManager blobs, StateId from, StateId to, byte[] data,
+        bool leaseBlobIds = true)
+    {
+        using ArenaWriter writer = arena.CreateWriter(data.Length);
+        data.CopyTo(writer.GetWriter().GetSpan(data.Length));
+        writer.GetWriter().Advance(data.Length);
+        (_, ArenaReservation reservation) = writer.Complete();
+        if (leaseBlobIds) LeaseBlobIds(reservation, blobs);
+        return new PersistedSnapshot(from, to, reservation, blobs, SnapshotTier.PersistedBase, RefCountedBloomFilter.AlwaysTrue());
+    }
+
+    /// <summary>
+    /// Populates <paramref name="content"/> with a contiguous run of storage slots
+    /// [firstSlot, firstSlot + count) on <paramref name="address"/>, each carrying a
+    /// distinct full 32-byte value (see <see cref="SequentialSlotValue"/>).
+    /// </summary>
+    /// <remarks>
+    /// Slot indices are stored big-endian, so a run of 65536 consecutive slots shares one
+    /// 30-byte slot-prefix and forms a single dense prefix group. The values keep a non-zero
+    /// leading byte so WithoutLeadingZeros() cannot trim them — a full group's inner
+    /// sub-slot table then stays large enough to exceed an ArenaBufferWriter buffer.
+    /// </remarks>
+    public static void AddSequentialSlots(SnapshotContent content, Address address, int firstSlot, int count)
+    {
+        for (int slot = firstSlot; slot < firstSlot + count; slot++)
+            content.Storages[(address, (UInt256)slot)] = new SlotValue(SequentialSlotValue(slot));
+    }
+
+    ///
+    /// A 32-byte storage value encoding in its trailing four bytes,
+    /// with a non-zero leading byte so it survives WithoutLeadingZeros() trimming intact.
+ /// + public static byte[] SequentialSlotValue(int slot) + { + byte[] value = new byte[32]; + value[0] = 0xFF; + BinaryPrimitives.WriteInt32BigEndian(value.AsSpan(28, 4), slot); + return value; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat.Test/TrieNodeCacheTests.cs b/src/Nethermind/Nethermind.State.Flat.Test/TrieNodeCacheTests.cs index 437ec1dbe470..e2c4ad140857 100644 --- a/src/Nethermind/Nethermind.State.Flat.Test/TrieNodeCacheTests.cs +++ b/src/Nethermind/Nethermind.State.Flat.Test/TrieNodeCacheTests.cs @@ -229,18 +229,19 @@ public void Sharding_StorageNodes_ShardByAddressFirstByte() { Hash256 address1 = new("0x1000000000000000000000000000000000000000000000000000000000000000"); Hash256 address2 = new("0x2000000000000000000000000000000000000000000000000000000000000000"); - TreePath path = TreePath.FromHexString("abcd"); + TreePath path1 = TreePath.FromHexString("1000"); + TreePath path2 = TreePath.FromHexString("2000"); Hash256 hash1 = Keccak.Compute([1]); Hash256 hash2 = Keccak.Compute([2]); TransientResource transientResource = _resourcePool.GetCachedResource(ResourcePool.Usage.MainBlockProcessing); - transientResource.Nodes.Set(address1, in path, new TrieNode(NodeType.Leaf, hash1)); - transientResource.Nodes.Set(address2, in path, new TrieNode(NodeType.Leaf, hash2)); + transientResource.Nodes.Set(address1, in path1, new TrieNode(NodeType.Leaf, hash1)); + transientResource.Nodes.Set(address2, in path2, new TrieNode(NodeType.Leaf, hash2)); _cache.Add(transientResource); - Assert.That(_cache.TryGet(address1, in path, hash1, out _), Is.True); - Assert.That(_cache.TryGet(address2, in path, hash2, out _), Is.True); + Assert.That(_cache.TryGet(address1, in path1, hash1, out _), Is.True); + Assert.That(_cache.TryGet(address2, in path2, hash2, out _), Is.True); } [Test] diff --git a/src/Nethermind/Nethermind.State.Flat/AssembledSnapshotResult.cs b/src/Nethermind/Nethermind.State.Flat/AssembledSnapshotResult.cs new file mode 100644 index 
000000000000..44c93274745a --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/AssembledSnapshotResult.cs @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.State.Flat.PersistedSnapshots; + +namespace Nethermind.State.Flat; + +public readonly struct AssembledSnapshotResult(SnapshotPooledList inMemory, PersistedSnapshotList persisted) : IDisposable +{ + public SnapshotPooledList InMemory { get; } = inMemory; + public PersistedSnapshotList Persisted { get; } = persisted; + public readonly int SnapshotCount => InMemory.Count + Persisted.Count; + + public readonly void Dispose() + { + InMemory.Dispose(); + Persisted.Dispose(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/CompactionSchedule.cs b/src/Nethermind/Nethermind.State.Flat/CompactionSchedule.cs index ede23888e9d0..662bf03b81dc 100644 --- a/src/Nethermind/Nethermind.State.Flat/CompactionSchedule.cs +++ b/src/Nethermind/Nethermind.State.Flat/CompactionSchedule.cs @@ -12,6 +12,7 @@ namespace Nethermind.State.Flat; public sealed class CompactionSchedule : ICompactionSchedule { private readonly int _compactSize; + private readonly int _maxCompactSize; private readonly long _offset; public CompactionSchedule( @@ -23,18 +24,18 @@ public CompactionSchedule( throw new ArgumentException("Compact size must be a power of 2"); _compactSize = config.CompactSize; + _maxCompactSize = config.PersistedSnapshotMaxCompactSize; ILogger logger = logManager.GetClassLogger(); _offset = ResolveOffset(metadataDb, config, logger); } - public long Offset => _offset; + internal long Offset => _offset; public int GetCompactSize(long blockNumber) { if (_compactSize <= 1 || blockNumber == 0) return 1; - long shifted = blockNumber + _offset; - return (int)Math.Min(shifted & -shifted, _compactSize); + return (int)Math.Min(ShiftedAlignment(blockNumber), _compactSize); } public long NextFullCompactionAfter(long from) @@ -45,6 +46,28 @@ public 
long NextFullCompactionAfter(long from) return from + distance; } + // The methods below do NOT short-circuit on `_compactSize <= 1` (the "compaction + // disabled" sentinel honoured by GetCompactSize and NextFullCompactionAfter), because + // PersistedSnapshotCompactor runs with its own min/max caps and may legitimately + // operate even when config.CompactSize == 1. + + public bool IsCompactSizeBoundary(long blockNumber) => + GetPersistedSnapshotCompactSize(blockNumber) == _compactSize; + + public bool IsLargeCompactionBoundary(long blockNumber) => + GetPersistedSnapshotCompactSize(blockNumber) > _compactSize; + + public long GetPersistedSnapshotCompactSize(long blockNumber) => + blockNumber == 0 ? 1 : Math.Min(ShiftedAlignment(blockNumber), _maxCompactSize); + + // x & -x (two's-complement lowest-set-bit trick): returns the largest power of 2 + // dividing the offset-shifted block number, used by all boundary checks. + private long ShiftedAlignment(long blockNumber) + { + long shifted = blockNumber + _offset; + return shifted & -shifted; + } + private long ResolveOffset(IDb metadataDb, IFlatDbConfig config, ILogger logger) { if (_compactSize <= 1) return 0; diff --git a/src/Nethermind/Nethermind.State.Flat/FlatDbManager.cs b/src/Nethermind/Nethermind.State.Flat/FlatDbManager.cs index 41d5cce31801..2189e45621f7 100644 --- a/src/Nethermind/Nethermind.State.Flat/FlatDbManager.cs +++ b/src/Nethermind/Nethermind.State.Flat/FlatDbManager.cs @@ -8,6 +8,8 @@ using Nethermind.Db; using Nethermind.Logging; using Nethermind.State.Flat.Persistence; +using Nethermind.Core.Attributes; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.Trie.Pruning; namespace Nethermind.State.Flat; @@ -64,6 +66,7 @@ public FlatDbManager( ISnapshotCompactor snapshotCompactor, ISnapshotRepository snapshotRepository, IPersistenceManager persistenceManager, + IPersistedSnapshotLoader persistedSnapshotLoader, IFlatDbConfig config, IBlocksConfig blocksConfig, ILogManager logManager, 
/// <summary>
/// Hands <paramref name="latestSnapshot"/> to the persistence manager and awaits it. If the persisted
/// state afterwards is anything other than <c>StateId.PreGenesis</c>, calls
/// <c>ClearReadOnlyBundleCache</c> and raises <c>ReorgBoundaryReached</c> with the persisted block number.
/// </summary>
private async Task PersistIfNeeded(StateId latestSnapshot)
{
    await _persistenceManager.AddToPersistence(latestSnapshot);

    StateId persisted = _persistenceManager.GetCurrentPersistedStateId();
    if (persisted != StateId.PreGenesis)
    {
        ClearReadOnlyBundleCache();
        ReorgBoundaryReached?.Invoke(this, new ReorgBoundaryReached(persisted.BlockNumber));
    }
}
invalidation hard. + // A linked-list snapshot chain was considered but rejected: the constantly moving chain makes + // invalidation error-prone. if (_logger.IsTrace) _logger.Trace($"Gathering {baseBlock}."); if (baseBlock == StateId.PreGenesis) { - // Special case for pregenesis. Note: nethermind always tries to generate genesis. - return new ReadOnlySnapshotBundle(new SnapshotPooledList(0), new NoopPersistenceReader(), _enableDetailedMetrics); + // PreGenesis is a sentinel; Nethermind always generates genesis, so this path is always transient. + return new ReadOnlySnapshotBundle(new SnapshotPooledList(0), new NoopPersistenceReader(), _enableDetailedMetrics, PersistedSnapshotStack.Empty(_enableDetailedMetrics)); } long sw = 0; @@ -272,10 +276,10 @@ public ReadOnlySnapshotBundle GatherReadOnlySnapshotBundle(in StateId baseBlock) } IPersistence.IPersistenceReader persistenceReader = _persistenceManager.LeaseReader(); - SnapshotPooledList snapshots; + AssembledSnapshotResult assembled; try { - snapshots = _snapshotRepository.AssembleSnapshots( + assembled = _snapshotRepository.AssembleSnapshots( baseBlock, persistenceReader.CurrentState, estimatedSize: Math.Max(1, _snapshotRepository.SnapshotCount / _compactSize)); @@ -286,12 +290,11 @@ public ReadOnlySnapshotBundle GatherReadOnlySnapshotBundle(in StateId baseBlock) throw; } - // Empty result + reader not at baseBlock means the path was removed concurrently; // retry unless baseBlock itself was pruned (orphaned), which no retry can recover. - if (snapshots.Count == 0 && persistenceReader.CurrentState != baseBlock) + if (assembled.SnapshotCount == 0 && persistenceReader.CurrentState != baseBlock) { - snapshots.Dispose(); + assembled.Dispose(); persistenceReader.Dispose(); if (!_snapshotRepository.HasState(baseBlock)) @@ -303,9 +306,12 @@ public ReadOnlySnapshotBundle GatherReadOnlySnapshotBundle(in StateId baseBlock) continue; } - if (_logger.IsTrace) _logger.Trace($"Gathered {baseBlock}. 
/// <summary>
/// Publishes metrics for an assembled bundle: the block-number depth of each part (last <c>To</c>
/// minus first <c>From</c>, or 0 when the part is empty), the snapshot count of each part, and the
/// summed <c>Size</c> of the persisted snapshots.
/// </summary>
private static void ReportBundleMetrics(in AssembledSnapshotResult assembled)
{
    int inMemoryCount = assembled.InMemory.Count;
    int persistedCount = assembled.Persisted.Count;

    int inMemoryDepth = 0;
    if (inMemoryCount > 0)
    {
        inMemoryDepth = (int)(assembled.InMemory[^1].To.BlockNumber - assembled.InMemory[0].From.BlockNumber);
    }

    int persistedDepth = 0;
    if (persistedCount > 0)
    {
        persistedDepth = (int)(assembled.Persisted[^1].To.BlockNumber - assembled.Persisted[0].From.BlockNumber);
    }

    Metrics.SnapshotBundleBlockNumberDepth.Observe(inMemoryDepth, _depthInMemoryLabel);
    Metrics.SnapshotBundleBlockNumberDepth.Observe(persistedDepth, _depthPersistedLabel);

    Metrics.SnapshotBundleSize = inMemoryCount;
    Metrics.SnapshotBundlePersistedSnapshotSize = persistedCount;

    long totalPersistedBytes = 0;
    int index = 0;
    while (index < assembled.Persisted.Count)
    {
        totalPersistedBytes += assembled.Persisted[index].Size;
        index++;
    }

    Metrics.SnapshotBundlePersistedSnapshotMemory = totalPersistedBytes;
}
/// <summary>
/// True when <paramref name="blockNumber"/>'s persisted-snapshot window
/// (<see cref="GetPersistedSnapshotCompactSize"/>) is exactly <c>CompactSize</c> — a boundary
/// whose only window is the CompactSized one, with no wider (&gt;CompactSize) merge to
/// perform. Mutually exclusive with <see cref="IsLargeCompactionBoundary"/>; together they
/// cover every persistence boundary.
/// </summary>
/// <param name="blockNumber">Block number to classify.</param>
bool IsCompactSizeBoundary(long blockNumber);

/// <summary>
/// True when <paramref name="blockNumber"/>'s persisted-snapshot window
/// (<see cref="GetPersistedSnapshotCompactSize"/>) is strictly larger than <c>CompactSize</c> —
/// a boundary that carries a wider (&gt;CompactSize) merge on top of the CompactSized
/// window. Mutually exclusive with <see cref="IsCompactSizeBoundary"/>; together they cover
/// every persistence boundary.
/// </summary>
/// <param name="blockNumber">Block number to classify.</param>
bool IsLargeCompactionBoundary(long blockNumber);

/// <summary>
/// The persisted-snapshot compaction tier for <paramref name="blockNumber"/> — the largest
/// power of 2 that divides <c>blockNumber + Offset</c>, capped at
/// <c>PersistedSnapshotMaxCompactSize</c>. Unlike <see cref="GetCompactSize"/> the cap is
/// <c>PersistedSnapshotMaxCompactSize</c> rather than <c>CompactSize</c>, so callers can act
/// on the wider merge windows (2×, 4×, …) above the persistence boundary.
/// </summary>
/// <param name="blockNumber">Block number whose tier is requested.</param>
long GetPersistedSnapshotCompactSize(long blockNumber);
entry); - bool RemoveAndReleaseCompactedKnownState(in StateId stateId); + + /// Add an in-memory snapshot to the store. + /// must be or . + bool TryAdd(Snapshot snapshot, SnapshotTier tier); + + /// Lease the in-memory snapshot at from the + /// store. must be an InMemory* value. + bool TryLeaseInMemoryState(in StateId stateId, SnapshotTier tier, [NotNullWhen(true)] out Snapshot? entry); + + /// Remove and release the in-memory snapshot at from the + /// store. must be an InMemory* value. + bool RemoveAndReleaseInMemoryKnownState(in StateId stateId, SnapshotTier tier); + + /// Whether a snapshot exists at in either the in-memory base store + /// or the persisted base bucket. bool HasState(in StateId stateId); - SnapshotPooledList AssembleSnapshots(in StateId stateId, in StateId targetStateId, int estimatedSize); - SnapshotPooledList AssembleSnapshotsUntil(in StateId stateId, long minBlockNumber, int estimatedSize); + + /// Index a caller-built into the bucket selected by + /// (must be a Persisted* value), acquiring the bucket's own lease. The + /// caller retains its construction lease and is responsible for the catalog entry — a freshly + /// persisted/compacted snapshot writes one; a snapshot reloaded from the catalog does not. + void AddPersistedSnapshot(PersistedSnapshot snapshot, SnapshotTier tier); + + /// Atomically swap the snapshot registered at in 's + /// bucket for , which must wrap the same on-disk reservation. The previous + /// entry's bucket lease is released so its CleanUp runs once any in-flight reader drains. Returns + /// false (leaving unregistered) when no entry is present. + bool ReplacePersistedSnapshot(in StateId to, PersistedSnapshot replacement, SnapshotTier tier); + + /// Adopt (a correct superset pre-filter) across every persisted + /// snapshot fully contained in (from, to], freeing each one's own bloom. 
Walks the base parent + /// chain from back to ; at each block re-registers a twin + /// over the same reservation carrying a lease on the shared bloom. Best-effort and lock-free across + /// buckets — a racing prune just leaves a snapshot with its own bloom. Pure live-memory optimization: + /// blooms are not persisted, so reload rebuilds independent blooms. + void ShareBloomAcrossRange(StateId from, StateId to, RefCountedBloomFilter sharedBloom, BlobArenaManager blobs); + + /// Lease every persisted base snapshot tiling (from, to]. Caller disposes the list. + PersistedSnapshotList LeaseBaseSnapshotsInRange(StateId from, StateId to); + + /// Whether the persisted base bucket holds a snapshot at . + bool HasBaseSnapshot(in StateId stateId); + + /// Every loaded persisted snapshot across the three buckets, for one-off lifecycle iteration + /// (bloom rebuild) at load time. + IEnumerable PersistedSnapshots { get; } + + /// Flag every persisted snapshot's files as shutdown-preserved so they survive process exit. + /// Must run (across all buckets) before the repository is disposed — a file shared between a base and a + /// compacted snapshot must be flagged before either snapshot is disposed. The implementation's + /// Dispose (invoked by DI) then disposes the snapshots and clears the buckets. + void MarkPersistedTierForShutdown(); + + /// Prune persisted snapshots with To.BlockNumber before the given block number. + void RemovePersistedStatesUntil(long blockNumber); + /// Assemble the backward chain from down to + /// across both tiers, returning the in-memory and persisted snapshots + /// along the winning path (oldest-first). Empty when no path reaches the target; caller disposes the result. + AssembledSnapshotResult AssembleSnapshots(in StateId stateId, in StateId targetStateId, int estimatedSize); + + /// Assemble the backward chain of in-memory snapshots from down to + /// for compaction (widest in-memory edge first). 
Oldest-first; empty when + /// the terminus is unreachable. Caller disposes the list. + SnapshotPooledList AssembleInMemorySnapshotsForCompaction(in StateId toStateId, long minBlockNumber, int estimatedSize); + + /// + /// Backward BFS from over the two-tier snapshot graph for the first + /// snapshot whose From equals — the next thing + /// to persist. Returns the leased persisted or in-memory snapshot (caller disposes), or + /// (null, null) when none is reachable. + /// + (PersistedSnapshot? Persisted, Snapshot? InMemory) FindSnapshotToPersist(in StateId seed, in StateId currentPersistedState, int compactSize); + + /// + /// Assemble the backward chain of persisted snapshots for compaction from + /// down to (widest persisted edge first). Oldest-first; empty when + /// fewer than two are found. Caller disposes the returned list. + /// + PersistedSnapshotList AssemblePersistedSnapshotsForCompaction(in StateId toStateId, long minBlockNumber); + /// The greatest known across the in-memory ordered set and the + /// persisted-tier maxima (the true cross-tier tip). null when empty. StateId? GetLastSnapshotId(); /// @@ -37,9 +118,19 @@ public interface ISnapshotRepository /// Returns the most recently committed state, or null if nothing was committed this session. StateId? GetLastCommittedStateId(); - bool TryFindAncestorStateAtBlock(in StateId head, long blockNumber, out StateId ancestor); + /// All registered in-memory state ids at (a fork can have + /// several). Caller disposes the list. ArrayPoolList GetStatesAtBlockNumber(long blockNumber); - void RemoveStatesUntil(in StateId currentPersistedStateId); + + /// All registered in-memory state ids with BlockNumber up to and including + /// . Caller disposes the list. 
+ ArrayPoolList GetStatesUpToBlock(long blockNumber); + + /// Remove every snapshot a persist to supersedes: in-memory + /// snapshots (both tiers) with To.BlockNumber up to and including , + /// and persisted-tier snapshots with To.BlockNumber strictly below it (the base at the persisted + /// block stays until the state advances past it). Folds in . + void RemoveStatesUntil(long blockNumber); /// /// Removes in-memory snapshots belonging to non-canonical forks that persisting diff --git a/src/Nethermind/Nethermind.State.Flat/Importer.cs b/src/Nethermind/Nethermind.State.Flat/Importer.cs index 30774f3adfaf..4d728f200ba9 100644 --- a/src/Nethermind/Nethermind.State.Flat/Importer.cs +++ b/src/Nethermind/Nethermind.State.Flat/Importer.cs @@ -16,7 +16,7 @@ namespace Nethermind.State.Flat; /// /// Imports state from trie-based persistence to flat persistence. /// -/// This importer uses SetAccountRaw/SetStorageRaw with hash-based keys. For PreimageFlat mode, +/// This importer uses SetAccountRaw/SetStorageRawEncoded with hash-based keys. For PreimageFlat mode, /// wrap the persistence with PreimageRecordingPersistence and provide a previously recorded /// preimage database - it will automatically translate raw operations to preimage-keyed operations. /// @@ -81,7 +81,7 @@ public async Task Copy(StateId to, CancellationToken cancellationToken = default await Task.WhenAll(tasks.AsSpan()); - // Finally, we increment the state id + // An empty write batch from→to advances the persisted state ID to `to` without writing any data entries. 
/// <summary>
/// Sequential byte sink: callers request a span with <see cref="GetSpan"/>, fill part of it and
/// commit the filled prefix with <see cref="Advance"/>.
/// </summary>
public interface IByteBufferWriter
{
    /// <summary>Returns a writable span of at least <paramref name="sizeHint"/> bytes at the current cursor.</summary>
    Span<byte> GetSpan(int sizeHint);

    /// <summary>Commits <paramref name="count"/> bytes previously written into the span from <see cref="GetSpan"/>.</summary>
    void Advance(int count);

    /// <summary>Number of bytes committed so far.</summary>
    long Written { get; }

    /// <summary>
    /// Smallest writer-local offset (in the same coordinate system as <see cref="Written"/>) that maps
    /// to a 4 KiB-aligned byte in the writer's eventual destination. Callers can pad to the next
    /// 4 KiB boundary with <c>(-(Written - FirstOffset)) &amp; PageLayout.PageMask</c>. For writers whose
    /// backing destination has no inherent alignment (e.g. transient in-memory buffers),
    /// implementations may return 0.
    /// </summary>
    long FirstOffset { get; }

    /// <summary>
    /// Appends <paramref name="value"/> to <paramref name="writer"/> in bounded chunks, so no single
    /// <see cref="GetSpan"/> request exceeds the chunk size.
    /// </summary>
    static void Copy<TWriter>(ref TWriter writer, ReadOnlySpan<byte> value) where TWriter : IByteBufferWriter
    {
        // Upper bound for a single GetSpan request.
        const int MaxChunk = 256;

        while (!value.IsEmpty)
        {
            int chunk = Math.Min(value.Length, MaxChunk);
            value[..chunk].CopyTo(writer.GetSpan(chunk));
            writer.Advance(chunk);
            value = value[chunk..];
        }
    }
}

/// <summary>
/// Absolute offset + length region within an <see cref="IByteReader{TPin}"/>.
/// </summary>
public readonly record struct Bound(long Offset, long Length);

/// <summary>
/// Pin handle returned by <see cref="IByteReader{TPin}.PinBuffer"/>: combines a disposable release
/// primitive with the pinned span itself. Implementations may be ref structs so the buffer's
/// lifetime is tracked by the compiler.
/// </summary>
public interface IBufferPin : IDisposable
{
    /// <summary>The pinned bytes; valid until the pin is disposed.</summary>
    ReadOnlySpan<byte> Buffer { get; }
}

/// <summary>
/// No-op pin for readers that can return zero-copy spans (e.g. <see cref="SpanByteReader"/>):
/// holds the span directly, no release work.
/// </summary>
public readonly ref struct NoOpPin(ReadOnlySpan<byte> buffer) : IBufferPin
{
    /// <inheritdoc />
    public ReadOnlySpan<byte> Buffer { get; } = buffer;

    /// <summary>Nothing to release.</summary>
    public void Dispose() { }
}

/// <summary>
/// Random-access byte source over a fixed region, generic over the pin handle type so readers can
/// return their own zero-allocation, non-virtual pin (no-op for in-memory, pooled-array for copy
/// fallback, page refcount for paged stores, etc.). The pinned buffer is exposed via
/// <see cref="IBufferPin.Buffer"/>.
/// </summary>
/// <typeparam name="TPin">
/// Pin handle type returned by <see cref="PinBuffer"/>. Must be a struct implementing
/// <see cref="IBufferPin"/>; <c>allows ref struct</c> permits readers to return ref-struct pins
/// (e.g. ones that hold a span directly).
/// </typeparam>
public interface IByteReader<TPin> where TPin : struct, IBufferPin, allows ref struct
{
    /// <summary>Length in bytes of the readable region.</summary>
    long Length { get; }

    /// <summary>
    /// Copy <c>output.Length</c> bytes starting at <paramref name="offset"/> into <paramref name="output"/>.
    /// Returns false if the range is out of bounds.
    /// </summary>
    bool TryRead(long offset, scoped Span<byte> output);

    /// <summary>
    /// Pin the window described by <paramref name="bound"/> (absolute offset + length). The pinned
    /// bytes are accessed via <see cref="IBufferPin.Buffer"/> and remain valid until the returned pin
    /// is disposed.
    /// </summary>
    TPin PinBuffer(Bound bound);

    /// <summary>
    /// Software-prefetch hint for the cache line(s) at <paramref name="offset"/>. No-op for readers
    /// without a stable base pointer; pointer-backed readers issue a real prefetch.
    /// </summary>
    void Prefetch(long offset);
}

/// <summary>
/// Factory for an <see cref="IByteReader{TPin}"/> over a fixed byte region. Readers are typically
/// ref structs and cannot be cached as fields, so consumers that need to traverse the same region
/// more than once (the persisted-snapshot scanner, the N-way merger) hold a small value-type source
/// and mint a fresh reader per use.
/// </summary>
public interface IByteReaderSource<TPin, TReader>
    where TPin : struct, IBufferPin, allows ref struct
    where TReader : IByteReader<TPin>, allows ref struct
{
    /// <summary>Creates a new reader over the source's region.</summary>
    TReader CreateReader();
}

/// <summary>
/// Heap-allocated owner of a growable native-memory <see cref="Writer"/>. Dispose releases the
/// native buffer; the buffer is not released by the GC.
/// </summary>
/// <param name="initialCapacity">Initial buffer size in bytes; must not be negative. 0 defers allocation to the first write.</param>
/// <param name="firstOffset">Value reported by <see cref="Writer.FirstOffset"/>.</param>
public sealed class PooledByteBufferWriter(int initialCapacity, long firstOffset = 0) : IDisposable
{
    private Writer _writer = new(initialCapacity, firstOffset);

    /// <summary>Returns the embedded writer by reference so calls mutate this instance's cursor.</summary>
    public ref Writer GetWriter() => ref _writer;

    /// <summary>Bytes committed so far. Invalidated by any call that grows or releases the buffer.</summary>
    public ReadOnlySpan<byte> WrittenSpan => _writer.WrittenSpan;

    /// <summary>Resets the write cursor to 0 without releasing the backing buffer.</summary>
    public void Reset() => _writer.Reset();

    /// <summary>Releases the native buffer. Safe to call more than once.</summary>
    public void Dispose() => _writer.ReturnBuffer();

    /// <summary>Growable writer over a <see cref="NativeMemory"/> allocation.</summary>
    public unsafe struct Writer : IByteBufferWriter
    {
        private byte* _buffer;
        private int _capacity;
        private int _written;
        private readonly long _firstOffset;

        internal Writer(int initialCapacity, long firstOffset)
        {
            // A negative capacity would otherwise be reinterpreted as a huge unsigned allocation size.
            ArgumentOutOfRangeException.ThrowIfNegative(initialCapacity);

            _capacity = initialCapacity;
            _buffer = initialCapacity == 0 ? null : (byte*)NativeMemory.Alloc((nuint)initialCapacity);
            _firstOffset = firstOffset;
        }

        /// <summary>
        /// Returns the unused tail of the buffer, growing it first when fewer than
        /// <paramref name="sizeHint"/> bytes remain. Spans returned earlier are invalid after a grow.
        /// </summary>
        public Span<byte> GetSpan(int sizeHint)
        {
            if (sizeHint > _capacity - _written) Grow(sizeHint);
            return new Span<byte>(_buffer + _written, _capacity - _written);
        }

        /// <summary>Moves the cursor by <paramref name="count"/>. Not range-checked; the caller must stay within the last span.</summary>
        public void Advance(int count) => _written += count;

        public readonly long Written => _written;
        public readonly long FirstOffset => _firstOffset;
        public readonly ReadOnlySpan<byte> WrittenSpan => new(_buffer, _written);

        /// <summary>Rewind the cursor to 0; keeps the backing buffer for reuse.</summary>
        public void Reset() => _written = 0;

        private void Grow(int sizeHint)
        {
            // 64-bit arithmetic: `_written + sizeHint` and `_capacity * 2` can both overflow int.
            long needed = (long)_written + sizeHint;
            if (needed > int.MaxValue)
                throw new InvalidOperationException($"Cannot grow buffer to {needed} bytes; the maximum is {int.MaxValue}.");

            long doubled = _capacity == 0 ? 1L : (long)_capacity * 2;
            int newSize = (int)Math.Min(int.MaxValue, Math.Max(needed, doubled));

            // Realloc preserves the existing contents and behaves like Alloc for a null pointer.
            // If it throws, _buffer/_capacity still describe the old (valid) allocation.
            _buffer = (byte*)NativeMemory.Realloc(_buffer, (nuint)newSize);
            _capacity = newSize;
        }

        internal void ReturnBuffer()
        {
            byte* buffer = _buffer;
            _buffer = null;
            _capacity = 0;
            // Also drop the cursor: otherwise WrittenSpan would describe a non-empty span over a null
            // pointer and a later Grow would copy from it.
            _written = 0;
            if (buffer is not null) NativeMemory.Free(buffer);
        }
    }
}

/// <summary>
/// Span-backed <see cref="IByteReader{TPin}"/>. Stored as a ref struct so the underlying span's
/// lifetime is tracked by the compiler — no raw pointers, no GC pinning concerns.
/// </summary>
public readonly ref struct SpanByteReader : IByteReader<NoOpPin>
{
    private readonly ReadOnlySpan<byte> _data;

    public SpanByteReader(ReadOnlySpan<byte> data) => _data = data;

    /// <inheritdoc />
    public long Length => _data.Length;

    /// <inheritdoc />
    public bool TryRead(long offset, scoped Span<byte> output)
    {
        // Signed 64-bit comparison: when the request is larger than the region the difference is
        // negative, which must read as "no valid offset" rather than wrap to a huge unsigned value.
        long lastValidOffset = (long)_data.Length - output.Length;
        if (offset < 0 || offset > lastValidOffset) return false;

        _data.Slice((int)offset, output.Length).CopyTo(output);
        return true;
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentOutOfRangeException">The window is negative or not fully inside the region.</exception>
    public NoOpPin PinBuffer(Bound bound)
    {
        // Checked without unsigned addition so negative components cannot wrap into range and then
        // be truncated to a different window by the int casts below.
        if (bound.Offset < 0 || bound.Length < 0 || bound.Offset > _data.Length - bound.Length)
            throw new ArgumentOutOfRangeException(nameof(bound));

        return new NoOpPin(_data.Slice((int)bound.Offset, (int)bound.Length));
    }

    /// <summary>No-op: this reader does not issue prefetches.</summary>
    public void Prefetch(long offset) { }
}
gauges below are mutated delta-wise by PersistedSnapshotBucket at every + // add/remove site (via .AddBy(tier, delta)), so callers must not recompute or overwrite them — + // they stay correct only as long as every mutation goes through the repo. + + [GaugeMetric] + [Description("Number of persisted snapshots on disk, by tier")] + [KeyIsLabel("tier", "size")] + public static ConcurrentDictionary PersistedSnapshotCount { get; } = new(); + + [GaugeMetric] + [Description("Estimated memory used by persisted snapshots in bytes, by tier")] + [KeyIsLabel("tier", "size")] + public static ConcurrentDictionary PersistedSnapshotMemory { get; } = new(); + + // Backed by a field so callers can update via Interlocked.Add(ref ...). + internal static long _persistedSnapshotBloomMemory; + + [GaugeMetric] + [Description("Memory used by per-snapshot blooms (address/slot/self-destruct/trie) in bytes")] + public static long PersistedSnapshotBloomMemory + { + get => Volatile.Read(ref _persistedSnapshotBloomMemory); + set => Volatile.Write(ref _persistedSnapshotBloomMemory, value); + } + + // Backed by a field so callers can update via Interlocked.Increment/Decrement(ref ...). 
+ internal static long _persistedSnapshotBloomCount; + + [DetailedMetric] + [GaugeMetric] + [Description("Number of live persisted-snapshot bloom filters (one per RefCountedBloomFilter; a bloom shared across snapshots counts once)")] + public static long PersistedSnapshotBloomCount + { + get => Volatile.Read(ref _persistedSnapshotBloomCount); + set => Volatile.Write(ref _persistedSnapshotBloomCount, value); + } + + [DetailedMetric] + [CounterMetric] + [Description("Number of persisted snapshot compactions performed")] + public static long PersistedSnapshotCompactions { get; set; } + + internal static long _persistedSnapshotPrunes; + + [DetailedMetric] + [CounterMetric] + [Description("Number of persisted snapshot prunes")] + public static long PersistedSnapshotPrunes + { + get => Volatile.Read(ref _persistedSnapshotPrunes); + set => Volatile.Write(ref _persistedSnapshotPrunes, value); + } + + // Push-style gauges for the persisted-snapshot arena/blob storage. Two separate gauge + // families: arena files (mmap-backed metadata) versus blob files (pread-only RLP), so + // bytes can be attributed to one or the other from the dashboard. + // + // Bytes are reported as **allocated** (sum of `Frontier` across open files) — i.e. bytes + // actually written, not the pre-extended sparse mmap region. Arena/Blob managers push + // deltas (via Interlocked on the backing fields) on every writer.Complete + on file + // open/close. 
+ internal static long _arenaFileCount; + + [GaugeMetric] + [Description("Number of arena (mmap metadata) files backing persisted snapshots")] + public static long ArenaFileCount + { + get => Volatile.Read(ref _arenaFileCount); + set => Volatile.Write(ref _arenaFileCount, value); + } + + internal static long _arenaAllocatedBytes; + + [GaugeMetric] + [Description("Allocated bytes in arena files (sum of per-file Frontier)")] + public static long ArenaAllocatedBytes + { + get => Volatile.Read(ref _arenaAllocatedBytes); + set => Volatile.Write(ref _arenaAllocatedBytes, value); + } + + internal static long _blobFileCount; + + [GaugeMetric] + [Description("Number of blob (pread RLP) files backing persisted snapshots")] + public static long BlobFileCount + { + get => Volatile.Read(ref _blobFileCount); + set => Volatile.Write(ref _blobFileCount, value); + } + + internal static long _blobAllocatedBytes; + + [GaugeMetric] + [Description("Allocated bytes in blob files (sum of per-file Frontier)")] + public static long BlobAllocatedBytes + { + get => Volatile.Read(ref _blobAllocatedBytes); + set => Volatile.Write(ref _blobAllocatedBytes, value); + } + + [GaugeMetric] + [Description("Number of live PersistedSnapshot instances (refcount > 0), by tier")] + [KeyIsLabel("tier", "size")] + public static ConcurrentDictionary ActivePersistedSnapshotCount { get; } = new(); + + internal static long _arenaReservationCount; + + [DetailedMetric] + [GaugeMetric] + [Description("Live arena reservations")] + public static long ArenaReservationCount + { + get => Volatile.Read(ref _arenaReservationCount); + set => Volatile.Write(ref _arenaReservationCount, value); + } + + internal static long _arenaReservationBytes; + + [DetailedMetric] + [GaugeMetric] + [Description("Live arena reservation bytes")] + public static long ArenaReservationBytes + { + get => Volatile.Read(ref _arenaReservationBytes); + set => Volatile.Write(ref _arenaReservationBytes, value); + } + + [DetailedMetric] + 
[Description("Snapshot-bundle depth in blocks, by part (in_memory / persisted)")] + [ExponentialPowerHistogramMetric(LabelNames = ["part"], Start = 1, Factor = 1.5, Count = 30)] + public static IMetricObserver SnapshotBundleBlockNumberDepth { get; set; } = new NoopMetricObserver(); + + [DetailedMetric] + [Description("Time spent skipping accounts/slots/state-rlp/storage-rlp on a read-only snapshot bundle access, by part")] + [ExponentialPowerHistogramMetric(LabelNames = ["part"], Start = 1, Factor = 1.5, Count = 30)] + public static IMetricObserver ReadOnlySnapshotBundleSkipTime { get; set; } = new NoopMetricObserver(); + + [DetailedMetric] + [Description("Time to convert one in-memory snapshot into a persisted snapshot")] + [ExponentialPowerHistogramMetric(Start = 1, Factor = 1.5, Count = 30)] + public static IMetricObserver PersistedSnapshotConvertTime { get; set; } = new NoopMetricObserver(); + + [DetailedMetric] + [Description("Persisted-snapshot byte size")] + [ExponentialPowerHistogramMetric(Start = 1, Factor = 1.5, Count = 30)] + public static IMetricObserver PersistedSnapshotSize { get; set; } = new NoopMetricObserver(); + + [DetailedMetric] + [Description("Persisted-snapshot compaction output size, by compact size")] + [ExponentialPowerHistogramMetric(LabelNames = ["size"], Start = 1, Factor = 1.5, Count = 30)] + public static IMetricObserver PersistedSnapshotCompactedSize { get; set; } = new NoopMetricObserver(); + + [DetailedMetric] + [Description("Persisted-snapshot compaction wall-clock time, by compact size")] + [ExponentialPowerHistogramMetric(LabelNames = ["size"], Start = 1, Factor = 1.5, Count = 30)] + public static IMetricObserver PersistedSnapshotCompactTime { get; set; } = new NoopMetricObserver(); } diff --git a/src/Nethermind/Nethermind.State.Flat/NodeRef.cs b/src/Nethermind/Nethermind.State.Flat/NodeRef.cs new file mode 100644 index 000000000000..d85e630ea12f --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/NodeRef.cs @@ -0,0 +1,48 
@@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Buffers.Binary; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Nethermind.State.Flat; + +/// +/// Reference to a trie-node RLP stored in a blob arena file. Persisted snapshots +/// store only metadata table locally; the RLP bytes live in a separate blob arena +/// file addressed by . +/// +[StructLayout(LayoutKind.Sequential, Pack = 1)] +public readonly struct NodeRef(ushort blobArenaId, int rlpDataOffset) +{ + public const int Size = 6; + + /// + /// ID of the blob arena file holding the RLP bytes (equals ArenaFile.Id). + /// 16-bit, so the per-tier file count is capped at ushort.MaxValue; with the + /// 2 GiB-per-file ceiling from that is ~128 TiB per tier. + /// + public ushort BlobArenaId { get; } = blobArenaId; + + /// + /// File-absolute byte offset of the RLP item's first byte. Length is recovered by parsing the + /// RLP header, so no per-entry length is stored. 32-bit caps a single blob arena file at 2 GiB + /// (enforced by on append). 
+ /// + public int RlpDataOffset { get; } = rlpDataOffset; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static NodeRef Read(ReadOnlySpan data) + { + ushort id = BinaryPrimitives.ReadUInt16LittleEndian(data); + int offset = BinaryPrimitives.ReadInt32LittleEndian(data[2..]); + return new NodeRef(id, offset); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Write(Span data, in NodeRef nodeRef) + { + BinaryPrimitives.WriteUInt16LittleEndian(data, nodeRef.BlobArenaId); + BinaryPrimitives.WriteInt32LittleEndian(data[2..], nodeRef.RlpDataOffset); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PageLayout.cs b/src/Nethermind/Nethermind.State.Flat/PageLayout.cs new file mode 100644 index 000000000000..5080b344f15d --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PageLayout.cs @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat; + +/// +/// Page-alignment constants shared by the flat-state on-disk writers. The 4 KiB page size +/// matches the typical OS page granularity targeted by the mmap-backed arenas; writers +/// pad to this size so a single value (trie-node RLP in a blob arena, sorted-table block) +/// never straddles a page that the reader would have to fault in just to splice across +/// the seam. +/// +public static class PageLayout +{ + /// Logical page size for blob-arena and sorted-table index alignment. + public const int PageSize = 4096; + + /// + /// Bitmask companion to for computing in-page offsets: + /// offsetInPage = absoluteOffset & PageMask. Typed as + /// because callers mask file-absolute offsets that may exceed 31 bits. + /// + public const long PageMask = PageSize - 1; + + /// + /// Bytes-to-next-page threshold below which the sorted-table builder pads up to the next + /// page boundary before writing the next node. 
The page-crossing heuristic stops a + /// node growing into the next page; padding eats the small leftover so the next + /// node opens on a fresh page. Threshold is intentionally large so most splits earn + /// the alignment; nodes finalised well inside their page (gap > threshold) skip + /// padding to avoid writing kilobytes of zeros. + /// + public const int PadThreshold = 64; + + /// + /// OS memory-page size — the granularity of madvise / posix_fadvise / + /// fallocate(PUNCH_HOLE). Distinct from , the fixed 4 KiB + /// logical page used for on-disk node alignment. + /// + public static readonly int OsPageSize = Environment.SystemPageSize; + + public static long RoundUpToOsPage(long value) => (value + OsPageSize - 1) & ~((long)OsPageSize - 1); +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/IPersistedSnapshotCompactor.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/IPersistedSnapshotCompactor.cs new file mode 100644 index 000000000000..6dcf39a308ed --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/IPersistedSnapshotCompactor.cs @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Collections; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +public interface IPersistedSnapshotCompactor : IAsyncDisposable +{ + /// + /// Enqueue a batch of newly-converted persisted-snapshot s for + /// background compaction. + /// + /// + /// Takes ownership of and disposes it once the batch has been + /// processed (or drained on cancellation). Asynchronously awaits a free slot when the internal + /// queue is full, providing backpressure to the block-processing pipeline without blocking a + /// thread. + /// + /// The converted states to compact; ownership transfers to the compactor. + /// The current persistence point (RocksDB persisted state block). 
+ /// Compaction windows are clamped to not reach below it — snapshots below are already in RocksDB, + /// so merging them would be wasted work. + /// Releases the backpressure wait when the producer is shutting down. + ValueTask EnqueueAsync(ArrayPoolList batch, long persistedBlockNumber, CancellationToken cancellationToken); +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/IPersistedSnapshotLoader.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/IPersistedSnapshotLoader.cs new file mode 100644 index 000000000000..14f85f2e5e47 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/IPersistedSnapshotLoader.cs @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// Owns the lifecycle of the 's persisted tier: loads it from the +/// catalog at startup () and tears it down at shutdown (). +/// +public interface IPersistedSnapshotLoader : IDisposable +{ + /// Drives the repository's persisted tier from empty to fully populated; called once at startup. + void Load(); + + /// Persists an in-memory as a base entry in the repository's persisted tier. 
+ void ConvertAndRegister(Snapshot snapshot); +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullPersistedSnapshotCompactor.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullPersistedSnapshotCompactor.cs new file mode 100644 index 000000000000..5cae3e131237 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullPersistedSnapshotCompactor.cs @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Collections; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// No-op wired alongside +/// when the long-finality feature is +/// disabled, so the rest of the persistence pipeline can resolve a compactor +/// without spinning up real arena-backed compaction work. +/// +public sealed class NullPersistedSnapshotCompactor : IPersistedSnapshotCompactor +{ + public static readonly NullPersistedSnapshotCompactor Instance = new(); + + private NullPersistedSnapshotCompactor() { } + + // Dispose immediately — no compaction work, but ownership still transfers so callers don't leak. + public ValueTask EnqueueAsync(ArrayPoolList batch, long persistedBlockNumber, CancellationToken cancellationToken) + { + batch.Dispose(); + return ValueTask.CompletedTask; + } + + // Shared singleton: disposal must be a safe no-op so a container or forwarding caller + // can dispose it without breaking the shared instance. 
+ public ValueTask DisposeAsync() => ValueTask.CompletedTask; +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullPersistedSnapshotLoader.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullPersistedSnapshotLoader.cs new file mode 100644 index 000000000000..5ce95702c95a --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullPersistedSnapshotLoader.cs @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// No-op wired when long finality is disabled: it neither loads an +/// existing persisted tier at startup nor converts in-memory snapshots into it, so the tier stays empty. +/// +public sealed class NullPersistedSnapshotLoader : IPersistedSnapshotLoader +{ + public static readonly NullPersistedSnapshotLoader Instance = new(); + + private NullPersistedSnapshotLoader() { } + + public void Load() { } + + public void ConvertAndRegister(Snapshot snapshot) { } + + // Shared singleton: disposal must be a safe no-op. + public void Dispose() { } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullSnapshotCatalog.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullSnapshotCatalog.cs new file mode 100644 index 000000000000..318f10ba5f16 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/NullSnapshotCatalog.cs @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.State.Flat.PersistedSnapshots.Storage; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// No-op wired alongside and +/// when long finality is disabled: the persisted tier is +/// always empty, so nothing is recorded, removed, or loaded. 
+/// +public sealed class NullSnapshotCatalog : ISnapshotCatalog +{ + public static readonly NullSnapshotCatalog Instance = new(); + + private NullSnapshotCatalog() { } + + public void Add(CatalogEntry entry) { } + + public bool Remove(in StateId to, long depth) => false; + + public IEnumerable Load() => []; +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshot.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshot.cs new file mode 100644 index 000000000000..6bce0706bcd1 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshot.cs @@ -0,0 +1,374 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Runtime.InteropServices; +using Nethermind.Core; +using Nethermind.Core.Collections; +using Nethermind.Core.Crypto; +using Nethermind.Core.Utils; +using Nethermind.Int256; +using Nethermind.Serialization.Rlp; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.Persistence.BloomFilter; +using Nethermind.State.Flat.PersistedSnapshots.Sorted; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// A persisted snapshot backed by a single-level on disk. Trie-node RLP +/// values are not stored inline — every trie-node entry holds a pointing into +/// a blob arena. The reservation owned by this snapshot stores the metadata table bytes only. +/// +/// +/// On-disk vocabulary (column / subcolumn tags, metadata keys, value markers) is defined in +/// and materialized by . +/// Each lookup binary searches the table, but the index search is clamped to the entity's column using +/// the per-column data-block bounds precalculated at construction (). 
+/// +public sealed class PersistedSnapshot : SmallRefCountingDisposable +{ + private readonly ArenaReservation _reservation; + // Metric label (tier + compact size) for the per-(tier, size) ActivePersistedSnapshotCount gauge. + private readonly PersistedSnapshotLabel _label; + // Each id is resolved on demand via _blobManager.GetFile(id), a lock-free O(1) array read. The + // canonical leased-id list lives on disk in this snapshot's metadata under the "ref_ids" key. + private readonly BlobArenaManager _blobManager; + // Per-column data-block bounds precalculated once at construction, so a point lookup clamps the + // stage-1 index search to its column instead of scanning the whole index (and reuses the footer). + private readonly PersistedSnapshotColumnBounds _columnBounds; + + public StateId From { get; } + public StateId To { get; } + + /// The persisted tier (bucket) this snapshot belongs to. + internal SnapshotTier Tier { get; } + + // Unified bloom gating all reads of this snapshot (address / slot / self-destruct keys and + // state- / storage-trie paths in one filter), held through a ref-counted owner so a large + // compaction can share one filter across the snapshots it contains. + private readonly RefCountedBloomFilter _bloom; + public BloomFilter Bloom => _bloom.Filter; + + /// The ref-counted bloom owner, for re-registering a twin over this snapshot that shares + /// another snapshot's bloom (the twin adopts a lease on that owner). + internal RefCountedBloomFilter BloomRef => _bloom; + + /// + /// The contiguous trie-RLP region this snapshot occupies in its blob arena, used to prefetch + /// the whole region in one bulk read-ahead. Non-empty only for base snapshots (which write all + /// their RLPs through one ); for + /// compacted / CompactSized snapshots, whose NodeRefs scatter across many blob arenas. 
+ /// + public BlobRange BlobRange { get; } + + public long Size => _reservation.Size; + + internal ArenaReservation Reservation => _reservation; + + /// + /// Begin a scoped whole-buffer read over this snapshot's reservation. By default the + /// session madvises the mmap range cold on dispose. + /// + public WholeReadSession BeginWholeReadSession(bool adviseDontNeedOnDispose = true) => + _reservation.BeginWholeReadSession(adviseDontNeedOnDispose); + + private ArenaByteReader CreateReader() => _reservation.CreateReader(); + + /// + /// Construct a snapshot over a pre-leased metadata reservation. The caller MUST have already + /// acquired one lease per blob arena id referenced by the snapshot's ref_ids metadata, + /// and is responsible for rolling those leases back on construction failure. This ctor bumps the + /// metadata reservation lease and stashes the manager ref for later id → file resolution. + /// + public PersistedSnapshot(StateId from, StateId to, ArenaReservation reservation, + BlobArenaManager blobManager, SnapshotTier tier, RefCountedBloomFilter bloom) + { + From = from; + To = to; + Tier = tier; + _reservation = reservation; + _label = new PersistedSnapshotLabel(tier.MetricTierLabel(), to.BlockNumber - from.BlockNumber); + _blobManager = blobManager; + _bloom = bloom; + _reservation.AcquireLease(); + + // Walk the on-disk ref_ids stream once and lease each referenced blob arena file. On + // partial failure we walk the prefix already acquired and drop those leases before + // unwinding the metadata reservation's lease and rethrowing. + int acquired = 0; + try + { + ArenaByteReader metaReader = _reservation.CreateReader(); + Bound table = new(0, metaReader.Length); + BlobRange = ReadBlobRange(in metaReader, table); + // Resolve each column's data-block run once (≈8 index ceiling searches) so subsequent point + // lookups clamp the index search to their column. Absent (empty table) → plain whole-table seek. 
+ _columnBounds = PersistedSnapshotColumnBounds.Compute(in metaReader, table); + + RefIdsEnumerator e = GetRefIdsEnumerator(); + while (e.MoveNext()) + { + if (!_blobManager.TryLeaseFile(e.Current, out _)) + throw new InvalidOperationException($"Blob arena {e.Current} not registered with the blob manager"); + acquired++; + } + } + catch + { + int released = 0; + RefIdsEnumerator e = GetRefIdsEnumerator(); + while (released < acquired && e.MoveNext()) + { + _blobManager.GetFile(e.Current).Dispose(); + released++; + } + _bloom.Dispose(); + _reservation.Dispose(); + throw; + } + + // Increment only after every throw path above has been cleared, so a + // partial-construction failure does not leave the gauge off by one. + Metrics.ActivePersistedSnapshotCount.AddBy(_label, 1); + } + + /// Seek a metadata entry (column 0xFF) by its NUL-padded name and return its + /// value bound, or a default bound if absent. + private static Bound SeekMetadata(scoped in TReader reader, Bound table, scoped ReadOnlySpan name) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + Span key = stackalloc byte[1 + PersistedSnapshotTags.MetadataKeyLength]; + int len = PersistedSnapshotKey.WriteMetadataKey(key, name); + return SortedTableReader.TrySeek(in reader, table, key[..len], out Bound b) ? b : default; + } + + /// + /// Forward iterator over this snapshot's referenced blob arena ids, reading the ref_ids value a + /// little-endian ushort at a time. Backed by a plain — the + /// surrounding snapshot's lease keeps the mmap alive. + /// + private RefIdsEnumerator GetRefIdsEnumerator() + { + ArenaByteReader reader = _reservation.CreateReader(); + return new RefIdsEnumerator(reader, new Bound(0, reader.Length)); + } + + /// + /// Read the blob_range metadata entry — the contiguous trie-RLP run recorded by base + /// snapshots. Returns when the key is absent (compacted / + /// CompactSized snapshots) or malformed. 
+ /// + private static BlobRange ReadBlobRange(scoped in ArenaByteReader reader, Bound table) + { + Bound b = SeekMetadata(in reader, table, PersistedSnapshotTags.MetadataBlobRangeKey); + if (b.Length == BlobRange.SerializedSize) + { + BlobRange range = default; + if (reader.TryRead(b.Offset, MemoryMarshal.AsBytes(new Span(ref range)))) + return range; + } + return BlobRange.None; + } + + /// + /// Ref-struct enumerator backing . Yields each referenced + /// by walking the ref-id records (column + /// ), which sort first in the table, and stopping + /// at the first non-ref-id record. + /// + private ref struct RefIdsEnumerator + where TReader : IByteReader, allows ref struct + where TPin : struct, IBufferPin, allows ref struct + { + private TReader _reader; + private SortedTableEnumerator _inner; + private ushort _current; + + internal RefIdsEnumerator(TReader reader, Bound table) + { + _reader = reader; + _inner = new SortedTableEnumerator(in reader, table); + } + + public readonly ushort Current => _current; + + public bool MoveNext() + { + if (!_inner.MoveNext(in _reader)) return false; + ReadOnlySpan key = _inner.CurrentKey; + if (key.Length == 0 || key[0] != PersistedSnapshotKey.RefIdColumn) return false; + _current = PersistedSnapshotKey.ReadRefId(key); + return true; + } + + public RefIdsEnumerator GetEnumerator() => this; + } + + public bool TryGetAccount(Address address, out Account? account) + { + ArenaByteReader reader = CreateReader(); + if (!PersistedSnapshotReader.TryGetAccount( + in reader, new Bound(0, reader.Length), in _columnBounds, address, out Bound b)) + { + account = null; + return false; + } + int bLenInt = checked((int)b.Length); + Span buf = bLenInt <= 256 ? 
stackalloc byte[256] : new byte[bLenInt]; + Span rlp = buf[..bLenInt]; + reader.TryRead(b.Offset, rlp); + return PersistedSnapshotPerAddress.TryDecodeAccount(rlp, out account); + } + + public bool TryGetSlot(Address address, in UInt256 index, ref SlotValue slotValue) + { + ArenaByteReader reader = CreateReader(); + if (!PersistedSnapshotReader.TryGetSlot( + in reader, new Bound(0, reader.Length), in _columnBounds, address, in index, out Bound b)) + return false; + Span buf = stackalloc byte[PersistedSnapshotTags.RlpSlotValueBufferSize]; + Span raw = buf[..checked((int)b.Length)]; + reader.TryRead(b.Offset, raw); + // length 0 = null/deleted slot (empty payload); a present value is RLP-wrapped. + ReadOnlySpan value = raw.Length == 0 ? raw : new Rlp.ValueDecoderContext(raw).DecodeByteArraySpan(); + slotValue = SlotValue.FromSpanWithoutLeadingZero(value); + return true; + } + + public bool? TryGetSelfDestructFlag(Address address) + { + ArenaByteReader reader = CreateReader(); + return PersistedSnapshotReader.TryGetSelfDestructFlag( + in reader, new Bound(0, reader.Length), in _columnBounds, address); + } + + public bool TryLoadStateNodeRlp(scoped in TreePath path, out byte[]? nodeRlp) + { + ArenaByteReader reader = CreateReader(); + if (!PersistedSnapshotReader.TryLoadStateNodeRlp( + in reader, new Bound(0, reader.Length), in _columnBounds, in path, out Bound bound)) + { + nodeRlp = null; + return false; + } + nodeRlp = ResolveTrieRlp(bound); + return true; + } + + public bool TryLoadStorageNodeRlp(in ValueHash256 addressHash, in TreePath path, out byte[]? 
nodeRlp)
+    {
+        ArenaByteReader reader = CreateReader();
+        if (!PersistedSnapshotReader.TryLoadStorageNodeRlp(
+            in reader, new Bound(0, reader.Length), in _columnBounds, in addressHash, in path, out Bound bound))
+        {
+            nodeRlp = null;
+            return false;
+        }
+        nodeRlp = ResolveTrieRlp(bound);
+        return true;
+    }
+
+    // Worst-case Merkle-Patricia branch node: 17 entries × (1-byte prefix + 32-byte hash)
+    // plus a 3-byte long-list framing header ≈ 564 bytes. Round up to 568 so the read
+    // covers any branch node in one pread.
+    private const int MaxTrieNodeRlpBytes = 568;
+
+    /// <summary>
+    /// Read one trie-node RLP item from blob arena <paramref name="blobArenaId"/> starting at
+    /// file offset <paramref name="offset"/>. Reads up to <see cref="MaxTrieNodeRlpBytes"/> bytes in a
+    /// single call, takes the item's total length from its RLP header, and returns a right-sized copy.
+    /// </summary>
+    /// <exception cref="InvalidDataException">
+    /// The RLP header declares more bytes than the read returned.
+    /// </exception>
+    private byte[] ReadBlobArenaRlp(ushort blobArenaId, int offset)
+    {
+        BlobArenaFile file = _blobManager.GetFile(blobArenaId);
+        Span<byte> buf = stackalloc byte[MaxTrieNodeRlpBytes];
+        int bytesRead = file.RandomRead(offset, buf);
+        // Only the bytes actually read are handed to the decoder, so the length peek cannot see
+        // stale stack contents past the end of a short read.
+        Rlp.ValueDecoderContext ctx = new(buf[..bytesRead]);
+        int totalLength = ctx.PeekNextRlpLength();
+        if (totalLength > bytesRead)
+            throw new InvalidDataException(
+                $"Trie-node RLP at blob arena {blobArenaId}+{offset} declares {totalLength} bytes " +
+                $"but only {bytesRead} were read (MaxTrieNodeRlpBytes = {MaxTrieNodeRlpBytes}).");
+        return buf[..totalLength].ToArray();
+    }
+
+    /// <summary>
+    /// Materialise the trie-node RLP at <paramref name="localBound"/>, which holds a
+    /// <see cref="NodeRef"/> pointing at the actual RLP bytes in a blob arena.
+    /// </summary>
+    /// <param name="localBound">Offset and length of the NodeRef bytes inside this snapshot's reservation.</param>
+    /// <returns>A fresh array holding the referenced RLP item.</returns>
+    /// <exception cref="InvalidDataException">
+    /// The NodeRef bytes could not be read from the reservation, or the referenced RLP is truncated.
+    /// </exception>
+    internal byte[] ResolveTrieRlp(Bound localBound)
+    {
+        NodeRef nodeRef = default;
+        // Fill the struct in place through its raw bytes; the slice is limited to the stored length.
+        Span<byte> nr = MemoryMarshal.AsBytes(new Span<NodeRef>(ref nodeRef))[..checked((int)localBound.Length)];
+        ArenaByteReader reader = _reservation.CreateReader();
+        // A failed read would leave nodeRef at default (blob arena 0, offset 0) and silently hand
+        // back some unrelated node's RLP, so surface the failure instead of ignoring the result.
+        if (!reader.TryRead(localBound.Offset, nr))
+            throw new InvalidDataException(
+                $"Failed to read the {localBound.Length}-byte NodeRef at reservation offset {localBound.Offset}.");
+        return ReadBlobArenaRlp(nodeRef.BlobArenaId, nodeRef.RlpDataOffset);
+    }
+
+    internal void AdviseDontNeed() => _reservation.AdviseDontNeed();
+
+    ///
+    /// Issue posix_fadvise(WILLNEED) over this base snapshot's contiguous trie-RLP region so
+    /// the kernel prefetches it ahead of a random-access read pass.
No-op for compacted / CompactSized + /// snapshots () or empty regions. + /// + public void AdviseWillNeedBlobRange() + { + if (BlobRange.IsEmpty) return; + _blobManager.GetFile(BlobRange.BlobArenaId).FadviseWillNeed(BlobRange.Offset, BlobRange.Length); + } + + /// + /// Issue posix_fadvise(DONTNEED) over this base snapshot's contiguous trie-RLP region, + /// dropping it from the OS page cache. No-op for compacted / CompactSized snapshots + /// () or empty regions. + /// + public void AdviseDontNeedBlobRange() + { + if (BlobRange.IsEmpty) return; + _blobManager.GetFile(BlobRange.BlobArenaId).FadviseDontNeed(BlobRange.Offset, BlobRange.Length); + } + + public bool TryAcquire() => TryAcquireLease(); + + /// + /// Advise this snapshot's mmap range cold and clear the per-arena page-tracker entries that + /// cover it. A hook for callers that have superseded this snapshot but want to drop its resident + /// pages eagerly rather than waiting for full disposal. + /// + public void Demote() => _reservation.AdviseAndFadviseDontNeed(); + + /// + /// Mark every file this snapshot references (its metadata 's + /// and every leased ) for + /// shutdown-preservation. Reads the leased id list from the metadata on each call; idempotent + /// and safe to call from any thread. + /// + public void PersistOnShutdown() + { + _reservation.PersistOnShutdown(); + foreach (ushort id in GetRefIdsEnumerator()) + _blobManager.GetFile(id).PersistOnShutdown(); + } + + protected override void CleanUp() + { + // Drain the iterator before disposing the reservation — the iterator reads through the + // reservation's mmap via an ArenaByteReader, and this snapshot's own lease (acquired at + // construction) keeps the mmap alive until it drops at the end of CleanUp. 
+ foreach (ushort id in GetRefIdsEnumerator()) + { + BlobArenaFile file = _blobManager.GetFile(id); + file.Dispose(); + // Opportunistic reclaim: if we were the last external lessee, signal the manager to + // drop the file's frontier back to 0. + if (file.HasOnlyManagerLease) + _blobManager.TryResetOrphanedFrontier(file); + } + _reservation.Dispose(); + + _bloom.Dispose(); + + Metrics.ActivePersistedSnapshotCount.AddBy(_label, -1); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotBloomBuilder.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotBloomBuilder.cs new file mode 100644 index 000000000000..7f441d19ef12 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotBloomBuilder.cs @@ -0,0 +1,194 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Int256; +using Nethermind.State.Flat.Persistence.BloomFilter; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; +using WholeReadScanner = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotScanner< + Nethermind.State.Flat.PersistedSnapshots.Storage.WholeReadSession, + Nethermind.State.Flat.PersistedSnapshots.Storage.WholeReadSessionReader, + Nethermind.State.Flat.Io.NoOpPin>; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +internal static class PersistedSnapshotBloomBuilder +{ + /// + /// Build the unified bloom for — covers address / + /// slot / self-destruct keys plus state-trie and storage-trie paths in a single + /// filter. Reads bytes through the caller-owned ; this + /// method does not dispose it. 
+    ///
+    internal static BloomFilter Build(WholeReadSession session, PersistedSnapshot snapshot, double bitsPerKey)
+    {
+        WholeReadScanner scanner = PersistedSnapshotScanner.ForWholeRead(session, snapshot);
+
+        // Sweep 1 — tally every key that sweep 2 is going to insert, so the filter is sized up front.
+        long keyCount = 0;
+        foreach (WholeReadScanner.PerAddressEntry perAddress in scanner.PerAddresses)
+        {
+            if (perAddress.HasAccount)
+            {
+                keyCount++;
+            }
+
+            if (perAddress.SelfDestructFlag is not null)
+            {
+                keyCount++;
+            }
+        }
+
+        foreach (WholeReadScanner.SlotEntry _ in scanner.Slots)
+        {
+            // Every slot row contributes two keys: the address key and the (address, slot) key.
+            keyCount += 2;
+        }
+
+        foreach (WholeReadScanner.StateNodeEntry _ in scanner.StateNodes)
+        {
+            keyCount++;
+        }
+
+        foreach (WholeReadScanner.StorageNodeEntry _ in scanner.StorageNodes)
+        {
+            keyCount++;
+        }
+
+        // An empty snapshot still gets a filter with a capacity of one.
+        BloomFilter filter = new(Math.Max(keyCount, 1L), bitsPerKey);
+
+        // Sweep 2 — insert. Address keys for accounts and self-destruct records come first.
+        foreach (WholeReadScanner.PerAddressEntry perAddress in scanner.PerAddresses)
+        {
+            ulong addressKey = AddressKey(perAddress.Address);
+            if (perAddress.HasAccount)
+            {
+                filter.Add(addressKey);
+            }
+
+            if (perAddress.SelfDestructFlag is not null)
+            {
+                filter.Add(addressKey);
+            }
+        }
+
+        // Slot column: the owning address key, then the combined (address, slot) key.
+        foreach (WholeReadScanner.SlotEntry slotEntry in scanner.Slots)
+        {
+            ulong addressKey = AddressKey(slotEntry.AddressSpan);
+            filter.Add(addressKey);
+            filter.Add(SlotKey(addressKey, slotEntry.Slot));
+        }
+
+        // Trie nodes: state paths, then storage paths keyed together with their address hash.
+        foreach (WholeReadScanner.StateNodeEntry stateNode in scanner.StateNodes)
+        {
+            ulong stateKey = StatePathKey(stateNode.Path);
+            filter.Add(stateKey);
+        }
+
+        foreach (WholeReadScanner.StorageNodeEntry storageNode in scanner.StorageNodes)
+        {
+            ulong storageKey = StorageNodeKey(storageNode.AddressHash, storageNode.Path);
+            filter.Add(storageKey);
+        }
+
+        return filter;
+    }
+
+    ///
+    /// Bloom-key seed from the first 8 bytes of a raw 20-byte Address. The account / self-destruct
+    /// (0xFE) and slot (0xFD) columns store the raw Address right after the 1-byte column tag, so the
+    /// merger can read the seed directly from the outer key via
+    /// .
+    ///
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ulong AddressKey(Address address)
+    {
+        return AddressKey(address.Bytes);
+    }
+
+    /// <summary>
+    /// Span overload of <see cref="AddressKey(Address)"/>: the key is the first 8 bytes of
+    /// <paramref name="addressBytes"/> read as a <see cref="ulong"/>. Used by the builder loop,
+    /// which iterates raw 20-byte slices in a <c>NativeMemoryList</c> without materialising
+    /// an object per row.
+    /// </summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ulong AddressKey(scoped ReadOnlySpan<byte> addressBytes)
+    {
+        ulong seed = MemoryMarshal.Read<ulong>(addressBytes);
+        return seed;
+    }
+
+    /// <summary>
+    /// Slot bloom hash for a <see cref="UInt256"/> slot index: writes the slot as 32 big-endian
+    /// bytes once and delegates to the span overload, so both entry points hash the same bytes.
+    /// </summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ulong SlotKey(ulong addressKey, in UInt256 slot)
+    {
+        Span<byte> bigEndianSlot = stackalloc byte[32];
+        slot.ToBigEndian(bigEndianSlot);
+        return SlotKey(addressKey, bigEndianSlot);
+    }
+
+    /// <summary>
+    /// Span-based slot bloom hash: folds the 32-byte big-endian slot into the address key by
+    /// XOR-ing in four non-overlapping <see cref="ulong"/> words at byte offsets 0, 8, 16 and 24.
+    /// </summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ulong SlotKey(ulong addressKey, scoped ReadOnlySpan<byte> slot32)
+    {
+        ulong key = addressKey;
+        for (int wordStart = 0; wordStart < 32; wordStart += sizeof(ulong))
+        {
+            key ^= MemoryMarshal.Read<ulong>(slot32[wordStart..]);
+        }
+
+        return key;
+    }
+
+    ///
+    /// Bloom key for a state-trie node, hashed from the same encoded byte-sequence
+    /// that the writer stores on disk (3-byte form for length 0–5, 8-byte for 6–15,
+    /// 33-byte fallback for 16+). Routing through the encoding makes the key
+    /// independent of whether the path arrived canonical or with a
+    /// non-zero tail, and matches the path the scanner reconstructs on reload.
+    ///
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ulong StatePathKey(in TreePath path)
+    {
+        Span<byte> encoded = stackalloc byte[33];
+        // The fold below reads all 33 bytes, but the short encodings fill only the first 3 or 8.
+        // stackalloc contents are undefined unless explicitly cleared, so zero the scratch first —
+        // as the span overload does — to keep the key deterministic and equal to the one hashed
+        // from the raw encoded key.
+        encoded.Clear();
+
+        int length = path.Length;
+        if (length <= 5)
+        {
+            path.EncodeWith3Byte(encoded[..3]);
+        }
+        else if (length <= 15)
+        {
+            path.EncodeWith8Byte(encoded[..8]);
+        }
+        else
+        {
+            // Fallback layout: the raw path bytes followed by the length in the last byte.
+            path.Path.Bytes.CopyTo(encoded);
+            encoded[32] = (byte)length;
+        }
+
+        ulong p0 = MemoryMarshal.Read<ulong>(encoded);
+        ulong p1 = MemoryMarshal.Read<ulong>(encoded[8..]);
+        ulong p2 = MemoryMarshal.Read<ulong>(encoded[16..]);
+        ulong p3 = MemoryMarshal.Read<ulong>(encoded[24..]);
+        return p0 ^ p1 ^ p2 ^ p3 ^ encoded[32];
+    }
+
+    /// <summary>
+    /// Bloom key for a storage-trie node. Storage has no top tier (it matches the persistence layout):
+    /// the path is encoded 8-byte for length 0–15 and 33-byte fallback for 16+ — distinct from state,
+    /// so it does not route through <see cref="StatePathKey(in TreePath)"/>. The encoded path bytes are
+    /// fed to the span hasher so the build/query hash equals the merger's hash of the raw on-disk key.
+    /// </summary>
+    /// <param name="addressHash">Hash identifying the storage trie; its first 8 bytes seed the key.</param>
+    /// <param name="path">Path of the node inside that storage trie.</param>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ulong StorageNodeKey(in ValueHash256 addressHash, in TreePath path)
+    {
+        // Only encoded[..len] is handed on, and the span hasher zero-pads its own copy, so this
+        // scratch does not need clearing.
+        Span<byte> encoded = stackalloc byte[33];
+        int len;
+        if (path.Length <= 15)
+        {
+            path.EncodeWith8Byte(encoded[..8]);
+            len = 8;
+        }
+        else
+        {
+            path.Path.Bytes.CopyTo(encoded);
+            encoded[32] = (byte)path.Length;
+            len = 33;
+        }
+        return MemoryMarshal.Read<ulong>(addressHash.Bytes) ^ StatePathKey(encoded[..len]);
+    }
+
+    ///
+    /// Span-based path hasher for callers (the merger) that see raw encoded column keys rather than
+    /// reconstructed TreePaths. Used for both state and storage path portions. Byte-
+    /// equivalent to the TreePath overloads: the 3-byte state-top, 8-byte compact, and
+    /// 33-byte fallback keys are exactly what EncodeWith3Byte/EncodeWith8Byte (and the
+    /// fallback [path.Path.Bytes][length]) produce.
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong StatePathKey(scoped ReadOnlySpan encodedKey) + { + Span encoded = stackalloc byte[33]; + encoded.Clear(); + encodedKey.CopyTo(encoded); + ulong p0 = MemoryMarshal.Read(encoded); + ulong p1 = MemoryMarshal.Read(encoded[8..]); + ulong p2 = MemoryMarshal.Read(encoded[16..]); + ulong p3 = MemoryMarshal.Read(encoded[24..]); + return p0 ^ p1 ^ p2 ^ p3 ^ encoded[32]; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotBucket.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotBucket.cs new file mode 100644 index 000000000000..4a6a02b27876 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotBucket.cs @@ -0,0 +1,185 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Collections.Concurrent; +using System.Diagnostics.CodeAnalysis; +using Nethermind.Core.Collections; +using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots.Storage; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// One self-contained snapshot bucket for a single persisted : a To-keyed +/// for lock-free point lookups, a block-ordered +/// of its Tos, and running memory/count totals — all guarded by +/// the bucket's own . The bucket owns its share of the shared catalog and the +/// process-wide memory/count metrics, so insert/prune/remove are end-to-end here. +/// +/// +/// Totals are read lock-free via ; the dictionary serves +/// point lookups lock-free. The lock only serialises ordered-set mutation, catalog writes, and +/// the lease/dispose handoff so a racing prune cannot dispose an entry between insert and return. 
/// <summary>
/// One snapshot bucket for a single tier: a To-keyed dictionary for point lookups, a sorted set of
/// the same To keys for ordered pruning and range collection, and running memory/count totals.
/// </summary>
/// <remarks>
/// Every mutation takes the bucket's lock. Point lookups go straight to the concurrent dictionary
/// and the totals are read with <see cref="Interlocked"/>, so neither takes the lock.
/// </remarks>
internal sealed class PersistedSnapshotBucket(ISnapshotCatalog catalog, SnapshotTier tier, ILogger logger)
{
    private readonly ConcurrentDictionary<StateId, PersistedSnapshot> _byTo = new();
    private readonly SortedSet<StateId> _ordered = [];
    private readonly Lock _lock = new();
    private long _memoryBytes;
    private long _count;
    private readonly string _tierName = tier.MetricTierLabel();

    /// <summary>Sum of <c>Size</c> over the entries currently tracked by this bucket.</summary>
    public long MemoryBytes => Interlocked.Read(ref _memoryBytes);

    /// <summary>Number of entries currently tracked by this bucket.</summary>
    public long Count => Interlocked.Read(ref _count);

    /// <summary>The greatest To held by this bucket, or <c>null</c> when empty.</summary>
    public StateId? Max
    {
        get
        {
            using Lock.Scope scope = _lock.EnterScope();
            return _ordered.Count == 0 ? null : _ordered.Max;
        }
    }

    /// <summary>Metric label for a snapshot: this bucket's tier name plus the snapshot's block depth.</summary>
    private PersistedSnapshotLabel LabelFor(PersistedSnapshot snapshot) =>
        new(_tierName, snapshot.To.BlockNumber - snapshot.From.BlockNumber);

    /// <summary>
    /// Live snapshots. Enumerates the dictionary directly — does not allocate a Values snapshot.
    /// </summary>
    public IEnumerable<PersistedSnapshot> Snapshots
    {
        get
        {
            foreach (KeyValuePair<StateId, PersistedSnapshot> kv in _byTo)
                yield return kv.Value;
        }
    }

    /// <summary>Lock-free point lookup of the snapshot ending at <paramref name="to"/>.</summary>
    public bool TryGet(in StateId to, [NotNullWhen(true)] out PersistedSnapshot? snapshot) =>
        _byTo.TryGetValue(to, out snapshot);

    /// <summary>Lock-free membership test for <paramref name="to"/>.</summary>
    public bool ContainsKey(in StateId to) => _byTo.ContainsKey(to);

    /// <summary>
    /// Insert or overwrite the snapshot at <paramref name="to"/>, under this bucket's lock so the
    /// dictionary and the ordered set stay consistent.
    /// </summary>
    public void Set(in StateId to, PersistedSnapshot snapshot)
    {
        using Lock.Scope scope = _lock.EnterScope();
        SetLocked(to, snapshot);
    }

    /// <summary>
    /// Like <see cref="Set"/> but also acquires a lease on <paramref name="snapshot"/> under the same
    /// lock, so a racing prune cannot dispose the entry between insert and return. The catalog
    /// entry is written by the caller, not here.
    /// </summary>
    public void Add(in StateId to, PersistedSnapshot snapshot)
    {
        using Lock.Scope scope = _lock.EnterScope();
        SetLocked(to, snapshot);
        snapshot.AcquireLease();
    }

    /// <summary>
    /// Swap the entry at <paramref name="to"/> for <paramref name="replacement"/>: a lease is acquired
    /// on the replacement, the totals and gauges move from the old entry to the new one, and the
    /// old entry is disposed.
    /// </summary>
    /// <returns><c>false</c> (and no change) when no entry exists at <paramref name="to"/>.</returns>
    public bool Replace(in StateId to, PersistedSnapshot replacement)
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (!_byTo.TryGetValue(to, out PersistedSnapshot? old)) return false;
        replacement.AcquireLease();
        _byTo[to] = replacement;

        // Removal later subtracts the size of whatever is stored here, so the totals must follow
        // the swap. The count is adjusted only on the gauges (net zero for an unchanged label) so
        // lock-free readers of Count never observe a transient dip.
        Interlocked.Add(ref _memoryBytes, replacement.Size - old.Size);
        PersistedSnapshotLabel oldLabel = LabelFor(old);
        PersistedSnapshotLabel newLabel = LabelFor(replacement);
        Metrics.PersistedSnapshotMemory.AddBy(oldLabel, -old.Size);
        Metrics.PersistedSnapshotMemory.AddBy(newLabel, replacement.Size);
        Metrics.PersistedSnapshotCount.AddBy(oldLabel, -1);
        Metrics.PersistedSnapshotCount.AddBy(newLabel, 1);

        old.Dispose();
        return true;
    }

    /// <summary>
    /// Remove the entry at <paramref name="to"/> (catalog + index + dispose) under this bucket's lock.
    /// </summary>
    /// <returns><c>true</c> when an entry was present.</returns>
    public bool RemoveExact(in StateId to)
    {
        using Lock.Scope scope = _lock.EnterScope();
        return RemoveLocked(to);
    }

    /// <summary>
    /// Prune the block-ordered prefix whose <c>To.BlockNumber &lt; beforeBlock</c>, removing each
    /// entry under this bucket's lock.
    /// </summary>
    public void PruneBefore(long beforeBlock)
    {
        using Lock.Scope scope = _lock.EnterScope();
        // Materialise the prefix first — the removal loop mutates the ordered set.
        using ArrayPoolList<StateId> toRemove = new(0);
        foreach (StateId to in _ordered)
        {
            if (to.BlockNumber >= beforeBlock) break;
            toRemove.Add(to);
        }
        foreach (StateId to in toRemove) RemoveLocked(to);
    }

    /// <summary>
    /// Copy this bucket's To keys in the inclusive [<paramref name="min"/>, <paramref name="max"/>]
    /// range into <paramref name="into"/>, under this bucket's lock.
    /// </summary>
    public void CollectRange(in StateId min, in StateId max, ISet<StateId> into)
    {
        using Lock.Scope scope = _lock.EnterScope();
        foreach (StateId to in _ordered.GetViewBetween(min, max))
            into.Add(to);
    }

    /// <summary>Call <c>PersistOnShutdown</c> on every live snapshot, under this bucket's lock.</summary>
    public void PersistAllOnShutdown()
    {
        using Lock.Scope scope = _lock.EnterScope();
        foreach (KeyValuePair<StateId, PersistedSnapshot> kv in _byTo)
            kv.Value.PersistOnShutdown();
    }

    /// <summary>
    /// Dispose every live snapshot, clear the index, and roll back this bucket's contribution to
    /// the global memory/count gauges. Under this bucket's lock.
    /// </summary>
    public void DisposeAndClear()
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (logger.IsDebug && _byTo.Count > 0) logger.Debug($"Releasing {_byTo.Count} persisted snapshot(s) ({_tierName}) on teardown");
        foreach (KeyValuePair<StateId, PersistedSnapshot> kv in _byTo)
        {
            PersistedSnapshotLabel label = LabelFor(kv.Value);
            Metrics.PersistedSnapshotMemory.AddBy(label, -kv.Value.Size);
            Metrics.PersistedSnapshotCount.AddBy(label, -1);
            kv.Value.Dispose();
        }
        _byTo.Clear();
        _ordered.Clear();
        Interlocked.Exchange(ref _memoryBytes, 0);
        Interlocked.Exchange(ref _count, 0);
    }

    /// <summary>
    /// Store <paramref name="snapshot"/> at <paramref name="to"/> and account for it. This bucket's
    /// lock must be held.
    /// </summary>
    private void SetLocked(in StateId to, PersistedSnapshot snapshot)
    {
        // Overwrite: roll back the displaced entry first so the totals and gauges only ever cover
        // what the dictionary actually holds.
        // NOTE(review): the displaced snapshot is not disposed here (unchanged from before) —
        // confirm that callers overwriting an entry release it themselves.
        if (_byTo.TryGetValue(to, out PersistedSnapshot? displaced))
            Untrack(displaced);

        _byTo[to] = snapshot;
        _ordered.Add(to);
        Track(snapshot);
    }

    /// <summary>Add <paramref name="snapshot"/> to the bucket totals and the global gauges.</summary>
    private void Track(PersistedSnapshot snapshot)
    {
        Interlocked.Add(ref _memoryBytes, snapshot.Size);
        Interlocked.Increment(ref _count);
        PersistedSnapshotLabel label = LabelFor(snapshot);
        Metrics.PersistedSnapshotMemory.AddBy(label, snapshot.Size);
        Metrics.PersistedSnapshotCount.AddBy(label, 1);
    }

    /// <summary>Remove <paramref name="snapshot"/> from the bucket totals and the global gauges.</summary>
    private void Untrack(PersistedSnapshot snapshot)
    {
        Interlocked.Add(ref _memoryBytes, -snapshot.Size);
        Interlocked.Decrement(ref _count);
        PersistedSnapshotLabel label = LabelFor(snapshot);
        Metrics.PersistedSnapshotMemory.AddBy(label, -snapshot.Size);
        Metrics.PersistedSnapshotCount.AddBy(label, -1);
    }

    /// <summary>
    /// Remove <paramref name="to"/> from the index and the catalog, roll back the totals (bumping
    /// the prune metric), and dispose the snapshot. This bucket's lock must be held.
    /// </summary>
    /// <returns><c>true</c> when an entry was present.</returns>
    private bool RemoveLocked(in StateId to)
    {
        _ordered.Remove(to);
        if (!_byTo.TryRemove(to, out PersistedSnapshot? snapshot)) return false;
        // Depth is computed before Dispose and passed to the catalog together with the To key.
        long depth = to.BlockNumber - snapshot.From.BlockNumber;
        Untrack(snapshot);
        Interlocked.Increment(ref Metrics._persistedSnapshotPrunes);
        catalog.Remove(to, depth);
        if (logger.IsDebug) logger.Debug($"Released persisted snapshot {_tierName} {snapshot.From.BlockNumber}->{to.BlockNumber}");
        snapshot.Dispose();
        return true;
    }
}
Trie-node RLP values are stored as s pointing into blob arenas; +/// account / slot / self-destruct / metadata values are inlined. +/// +/// +/// State nodes bucket into top (3-byte, length 0–5) / compact (8-byte, 6–15) / fallback (16+); storage +/// nodes bucket into compact (8-byte, 0–15) / fallback (16+) — no top tier — matching the persistence +/// () key layout. The materialized keys are streamed to a +/// in strictly ascending key order — the builder enforces the +/// order rather than sorting — so emits by ascending column (ref-id, storage, state, +/// slots, per-address, metadata), merging the storage sublists. The key encoding stores column / subcolumn +/// tag bytes as 255 − tag so that plain ascending order reproduces the reverse-tag emission order. +/// +public static class PersistedSnapshotBuilder +{ + private const int StateTopPathThreshold = 5; + private const int CompactPathThreshold = 15; + + private static readonly Comparison StateNodeComparer = (a, b) => + { + int cmp = a.Path.Bytes.SequenceCompareTo(b.Path.Bytes); + return cmp != 0 ? cmp : a.Length.CompareTo(b.Length); + }; + + // Sorts storage-trie node keys by 20-byte address-hash prefix (matching the column outer key) + // and then by encoded path so per-addressHash slices are contiguous and emitted in sorted order. + private static readonly Comparison<(ValueHash256 AddrHash, TreePath Path)> StorageNodeComparer = (a, b) => + { + int cmp = a.AddrHash.Bytes[..PersistedSnapshotTags.AddressHashPrefixLength].SequenceCompareTo(b.AddrHash.Bytes[..PersistedSnapshotTags.AddressHashPrefixLength]); + if (cmp != 0) return cmp; + cmp = a.Path.Path.Bytes.SequenceCompareTo(b.Path.Path.Bytes); + return cmp != 0 ? cmp : a.Path.Length.CompareTo(b.Path.Length); + }; + + // Sorts slot entries by raw Address bytes then by slot value, so per-address slices are + // contiguous and slot keys within a slice are in sorted big-endian order. 
/// <summary>
/// Serialises <paramref name="snapshot"/> into <paramref name="writer"/>: the keys are extracted and
/// sorted in parallel, then every column is streamed to a sorted-table builder in ascending key
/// order while trie-node RLP goes through <paramref name="blobWriter"/> and bloom keys into
/// <paramref name="bloom"/>.
/// </summary>
/// <remarks>
/// Each key list is published to the outer variables the moment it is allocated, and the whole
/// pipeline runs inside one try/finally, so a failure during extraction, sorting or writing still
/// disposes every list that was created.
/// </remarks>
public static void Build<TWriter>(Snapshot snapshot, ref TWriter writer, BlobArenaWriter blobWriter, BloomFilter bloom) where TWriter : IByteBufferWriter
{
    // To stay off the LOH, we keep only the unmanaged sort keys in NativeMemoryList
    // (off-heap) and re-fetch the TrieNode value from the source dictionary at write time.
    // PooledSet is used for the small address dedup map so its backing entry array is
    // pool-rented rather than freshly allocated each block.
    NativeMemoryList<TreePath> stateTopKeys = null!, stateCompactKeys = null!, stateFallbackKeys = null!;
    NativeMemoryList<(ValueHash256 AddrHash, TreePath Path)> storCompactKeys = null!, storFallbackKeys = null!;
    NativeMemoryList<((ValueAddress Addr, UInt256 Slot) Key, SlotValue? Value)> sortedStorages = null!;
    NativeMemoryList<ValueAddress> uniqueAddresses = null!;

    try
    {
        // Parallel extraction + sort: three independent jobs over disjoint dictionaries.
        // Parallel.Invoke returns (or throws) only after every job has finished, so the finally
        // below never disposes a list that is still being filled.
        Parallel.Invoke(
            () =>
            {
                NativeMemoryList<TreePath> top = stateTopKeys = new(0);
                NativeMemoryList<TreePath> compact = stateCompactKeys = new(snapshot.StateNodesCount);
                NativeMemoryList<TreePath> fallback = stateFallbackKeys = new(0);
                foreach (var kv in snapshot.StateNodes)
                {
                    if (kv.Value.FullRlp.Length == 0 && kv.Value.NodeType == NodeType.Unknown) continue;
                    TreePath path = kv.Key;
                    if (path.Length <= StateTopPathThreshold) top.Add(path);
                    else if (path.Length <= CompactPathThreshold) compact.Add(path);
                    else fallback.Add(path);
                    kv.Value.IsPersisted = true;
                    kv.Value.PrunePersistedRecursively(1);
                }
                Parallel.Invoke(
                    () => top.Sort(StateNodeComparer),
                    () => compact.Sort(StateNodeComparer),
                    () => fallback.Sort(StateNodeComparer));
            },
            () =>
            {
                NativeMemoryList<(ValueHash256, TreePath)> compact = storCompactKeys = new(snapshot.StorageNodesCount);
                NativeMemoryList<(ValueHash256, TreePath)> fallback = storFallbackKeys = new(0);
                foreach (var kv in snapshot.StorageNodes)
                {
                    if (kv.Value.FullRlp.Length == 0 && kv.Value.NodeType == NodeType.Unknown) continue;
                    (Hash256 addr, TreePath path) = kv.Key.Key;
                    ValueHash256 addrHash = addr.ValueHash256;
                    if (path.Length <= CompactPathThreshold) compact.Add((addrHash, path));
                    else fallback.Add((addrHash, path));
                    kv.Value.IsPersisted = true;
                    kv.Value.PrunePersistedRecursively(1);
                }
                Parallel.Invoke(
                    () => compact.Sort(StorageNodeComparer),
                    () => fallback.Sort(StorageNodeComparer));
            },
            () =>
            {
                using PooledSet<HashedKey<Address>> seen = new();
                foreach (var kv in snapshot.Accounts)
                    seen.Add(kv.Key);
                foreach (var kv in snapshot.SelfDestructedStorageAddresses)
                    seen.Add(kv.Key);

                NativeMemoryList<((ValueAddress Addr, UInt256 Slot) Key, SlotValue? Value)> storages =
                    sortedStorages = new(Math.Max(1, snapshot.StoragesCount));
                foreach (var kv in snapshot.Storages)
                {
                    (Address addr, UInt256 slot) = kv.Key.Key;
                    storages.Add(((new ValueAddress(addr.Bytes), slot), kv.Value));
                    seen.Add(addr);
                }

                NativeMemoryList<ValueAddress> addresses = uniqueAddresses = new(Math.Max(1, seen.Count));
                foreach (var addr in seen)
                    addresses.Add(new ValueAddress(addr.Key.Bytes));
                addresses.Sort(ValueAddressComparer);

                storages.Sort(StoragesByAddressComparer);
            });

        SortedTableBuilder<TWriter> table = new(ref writer);
        try
        {
            // Records are streamed in strictly ascending key order (the builder enforces it), so emit
            // by ascending column: ref-id (0x00), storage nodes (0xF9), state fallback/compact/top
            // (0xFA/0xFB/0xFC), slots (0xFD), per-address self-destruct/account (0xFE), metadata
            // (0xFF). Metadata is last so its blob_range records the now-final blob-arena run; the
            // ref-id is first but only needs the (fixed) blob-arena id.
            WriteRefId(ref table, blobWriter);
            WriteStorageNodes(ref table, snapshot, storFallbackKeys, storCompactKeys, blobWriter, bloom);
            WriteStateNodes(ref table, snapshot, stateFallbackKeys, blobWriter, bloom);
            WriteStateNodes(ref table, snapshot, stateCompactKeys, blobWriter, bloom);
            WriteStateNodes(ref table, snapshot, stateTopKeys, blobWriter, bloom);
            WriteSlots(ref table, sortedStorages, bloom);
            WritePerAddress(ref table, snapshot, uniqueAddresses, bloom);
            WriteMetadata(ref table, snapshot, blobWriter);

            table.Build();
        }
        finally
        {
            table.Dispose();
        }
    }
    finally
    {
        // Null-conditional: a job that failed early may not have allocated all of its lists.
        sortedStorages?.Dispose();
        uniqueAddresses?.Dispose();
        stateTopKeys?.Dispose();
        stateCompactKeys?.Dispose();
        stateFallbackKeys?.Dispose();
        storCompactKeys?.Dispose();
        storFallbackKeys?.Dispose();
    }
}
/// <summary>
/// Emits one slot record per entry of <paramref name="sortedStorages"/>, in list order, and adds
/// each slot's bloom key to <paramref name="bloom"/>.
/// </summary>
/// <param name="table">Destination table builder; each key/value is handed over via <c>Add</c>.</param>
/// <param name="sortedStorages">Slot entries, already sorted by the caller.</param>
/// <param name="bloom">Bloom filter receiving one slot key per entry.</param>
private static void WriteSlots<TWriter>(
    ref SortedTableBuilder<TWriter> table,
    NativeMemoryList<((ValueAddress Addr, UInt256 Slot) Key, SlotValue? Value)> sortedStorages,
    BloomFilter bloom) where TWriter : IByteBufferWriter
{
    // Pool-rented scratch for the RLP-wrapped slot value; returned in the finally below.
    byte[] rlpScratch = ArrayPool<byte>.Shared.Rent(PersistedSnapshotTags.RlpSlotValueBufferSize);
    Span<byte> recordKey = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
    Span<byte> slotBytes = stackalloc byte[32];

    try
    {
        int total = sortedStorages.Count;
        for (int index = 0; index < total; index++)
        {
            // The address is copied into a local before taking a span over it, so the span is
            // backed by this local rather than by a temporary returned from the list indexer.
            ValueAddress owner = sortedStorages[index].Key.Addr;
            SlotValue? slotValue = sortedStorages[index].Value;
            sortedStorages[index].Key.Slot.ToBigEndian(slotBytes);

            // Bloom key combines the address key with the full 32-byte big-endian slot.
            bloom.Add(PersistedSnapshotBloomBuilder.SlotKey(
                PersistedSnapshotBloomBuilder.AddressKey(owner.AsSpan), slotBytes));

            // A present value is RLP-wrapped; a null slot keeps an empty payload.
            ReadOnlySpan<byte> payload;
            if (slotValue.HasValue)
            {
                int written = Rlp.Encode(slotValue.Value.AsReadOnlySpan.WithoutLeadingZeros(), rlpScratch);
                payload = rlpScratch.AsSpan(0, written);
            }
            else
            {
                payload = [];
            }

            int keyLength = PersistedSnapshotKey.WriteSlotKey(recordKey, owner.AsSpan, slotBytes);
            table.Add(recordKey[..keyLength], payload);
        }
    }
    finally
    {
        ArrayPool<byte>.Shared.Return(rlpScratch);
    }
}
/// <summary>
/// Emits one record per path in <paramref name="keys"/>, in list order: the node's RLP is written
/// through <paramref name="blobWriter"/>, the returned <see cref="NodeRef"/> becomes the record
/// value, and the path's bloom key is added to <paramref name="bloom"/>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// A path in <paramref name="keys"/> no longer resolves to a node in <paramref name="snapshot"/>.
/// </exception>
private static void WriteStateNodes<TWriter>(
    ref SortedTableBuilder<TWriter> table, Snapshot snapshot,
    NativeMemoryList<TreePath> keys, BlobArenaWriter blobWriter, BloomFilter bloom) where TWriter : IByteBufferWriter
{
    Span<byte> recordKey = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
    Span<byte> refBytes = stackalloc byte[NodeRef.Size];

    int total = keys.Count;
    for (int index = 0; index < total; index++)
    {
        TreePath path = keys[index];

        // The list holds only keys; the node itself is looked up again at write time.
        bool found = snapshot.TryGetStateNode(path, out TrieNode? node);
        if (!found || node is null)
            throw new InvalidOperationException($"State node {path} disappeared between extraction and persist.");

        NodeRef blobRef = blobWriter.WriteRlp(node.FullRlp.AsSpan());
        NodeRef.Write(refBytes, in blobRef);

        int keyLength = PersistedSnapshotKey.WriteStateNodeKey(recordKey, in path);
        table.Add(recordKey[..keyLength], refBytes);
        bloom.Add(PersistedSnapshotBloomBuilder.StatePathKey(in path));
    }
}
/// <summary>
/// Emits the single ref-id record: the key is derived from the blob writer's arena id and the
/// value is the fixed <c>RefIdValue</c>.
/// </summary>
private static void WriteRefId<TWriter>(ref SortedTableBuilder<TWriter> table, BlobArenaWriter blobWriter)
    where TWriter : IByteBufferWriter
{
    Span<byte> keyBytes = stackalloc byte[PersistedSnapshotKey.RefIdKeyLength];
    int keyLength = PersistedSnapshotKey.WriteRefIdKey(keyBytes, blobWriter.BlobArenaId);
    table.Add(keyBytes.Slice(0, keyLength), PersistedSnapshotTags.RefIdValue);
}

/// <summary>
/// Emits the metadata records in a fixed order: blob range, from-block, from-hash, to-block,
/// to-hash, format version.
/// </summary>
private static void WriteMetadata<TWriter>(
    ref SortedTableBuilder<TWriter> table, Snapshot snapshot, BlobArenaWriter blobWriter) where TWriter : IByteBufferWriter
{
    // The blob range is the span the writer advanced over (StartOffset .. Written); when nothing
    // was written the None marker is stored instead.
    BlobRange run;
    if (blobWriter.Written > blobWriter.StartOffset)
        run = new BlobRange(blobWriter.BlobArenaId, blobWriter.StartOffset, blobWriter.Written - blobWriter.StartOffset);
    else
        run = BlobRange.None;

    Span<byte> keyScratch = stackalloc byte[1 + PersistedSnapshotTags.MetadataKeyLength];
    Span<byte> blockNumberScratch = stackalloc byte[8];
    Span<byte> runScratch = stackalloc byte[BlobRange.SerializedSize];

    run.Write(runScratch);
    AddMetadata(ref table, keyScratch, PersistedSnapshotTags.MetadataBlobRangeKey, runScratch);

    // The 8-byte scratch is reused for both block numbers; each is added before the next overwrite.
    BitConverter.TryWriteBytes(blockNumberScratch, snapshot.From.BlockNumber);
    AddMetadata(ref table, keyScratch, PersistedSnapshotTags.MetadataFromBlockKey, blockNumberScratch);
    AddMetadata(ref table, keyScratch, PersistedSnapshotTags.MetadataFromHashKey, snapshot.From.StateRoot.Bytes);

    BitConverter.TryWriteBytes(blockNumberScratch, snapshot.To.BlockNumber);
    AddMetadata(ref table, keyScratch, PersistedSnapshotTags.MetadataToBlockKey, blockNumberScratch);
    AddMetadata(ref table, keyScratch, PersistedSnapshotTags.MetadataToHashKey, snapshot.To.StateRoot.Bytes);

    AddMetadata(ref table, keyScratch, PersistedSnapshotTags.MetadataVersionKey, PersistedSnapshotTags.MetadataFormatVersion);
}

/// <summary>
/// Writes the metadata key for <paramref name="name"/> into <paramref name="keyBuf"/> and adds it to
/// <paramref name="table"/> with <paramref name="value"/>.
/// </summary>
private static void AddMetadata<TWriter>(ref SortedTableBuilder<TWriter> table, scoped Span<byte> keyBuf,
    scoped ReadOnlySpan<byte> name, scoped ReadOnlySpan<byte> value) where TWriter : IByteBufferWriter
{
    int keyLength = PersistedSnapshotKey.WriteMetadataKey(keyBuf, name);
    ReadOnlySpan<byte> key = keyBuf.Slice(0, keyLength);
    table.Add(key, value);
}
Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots.Sorted; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// Per-column data-block bounds of a persisted snapshot's , precalculated once +/// when the snapshot is opened. Because keys sort by column tag first, every column occupies a contiguous +/// run of data blocks; caching each column's run lets a point lookup clamp the stage-1 index search to +/// that run (see ) instead of binary-searching the whole +/// index, and reuse the already-read footer instead of re-reading it per lookup. +/// +/// +/// [i] is the first data block whose separator is ≥ [i] +/// (i.e. the first block that may hold a key of that column), with +/// as the "past the end" sentinel for an absent column. The tags are ascending, so the starts are +/// non-decreasing and a real column's range is [start[i], start[i+1]] — the upper edge is the next +/// boundary's first block inclusive, because a single data block can straddle two columns. An +/// over-wide range stays correct; it only narrows the search a little less. +/// +internal readonly struct PersistedSnapshotColumnBounds +{ + // Ascending boundary tags. Real point-lookup columns are Storage (0xF9) .. Account (0xFE); RefId + // (0x00) and Metadata (0xFF) bracket them so every real column has a next-boundary upper edge. + private static ReadOnlySpan BoundaryTags => + [ + PersistedSnapshotKey.RefIdColumn, + PersistedSnapshotKey.StorageColumn, + PersistedSnapshotKey.StateFallbackColumn, + PersistedSnapshotKey.StateCompactColumn, + PersistedSnapshotKey.StateTopColumn, + PersistedSnapshotKey.SlotColumn, + PersistedSnapshotKey.AccountColumn, + PersistedSnapshotKey.MetadataColumn, + ]; + + private readonly SortedTable.Footer _footer; + private readonly long[]? 
/// <summary>
/// Returns the inclusive data-block range for <paramref name="columnTag"/>: the column's own start
/// block up to the next boundary's start block, both clamped to the table's last block.
/// </summary>
/// <param name="columnTag">Column tag, mapped to a boundary index relative to <c>StorageColumn</c>.</param>
/// <param name="loBlock">First block of the range.</param>
/// <param name="hiBlock">Last block of the range; never below <paramref name="loBlock"/>.</param>
public void GetColumnRange(byte columnTag, out long loBlock, out long hiBlock)
{
    long[] starts = _startBlock!;
    long lastBlock = _footer.NumDataBlocks - 1;

    // StorageColumn sits at boundary index 1 (index 0 is the RefId boundary), so each later tag
    // maps to the following index and the next boundary is always index + 1.
    int boundary = columnTag - PersistedSnapshotKey.StorageColumn + 1;

    loBlock = Math.Clamp(starts[boundary], 0, lastBlock);
    hiBlock = Math.Clamp(starts[boundary + 1], loBlock, lastBlock);
}
Without it +/// a worker could index a new persisted snapshot after the tier is marked, losing its files. +/// +public class PersistedSnapshotCompactor( + ISnapshotRepository snapshotRepository, + IArenaManager arenaManager, + BlobArenaManager blobs, + ISnapshotCatalog catalog, + IFlatDbConfig config, + ICompactionSchedule schedule, + IPersistedSnapshotLoader loader, + IProcessExitSource processExitSource, + ILogManager logManager) : IPersistedSnapshotCompactor +{ + // Held only to anchor the disposal order documented above (loader disposed after this). + private readonly IPersistedSnapshotLoader _disposeOrderingAnchor = loader; + private readonly ILogger _logger = logManager.GetClassLogger(); + private readonly ISnapshotCatalog _catalog = catalog; + private readonly ICompactionSchedule _schedule = schedule; + private readonly bool _validatePersistedSnapshot = config.ValidatePersistedSnapshot; + private readonly double _bloomBitsPerKey = config.PersistedSnapshotBloomBitsPerKey; + + private readonly Channel<(ArrayPoolList Batch, long PersistedBlockNumber)> _compactPersistedJobs = Channel.CreateBounded<(ArrayPoolList, long)>(16); + private readonly Channel<(StateId Boundary, long PersistedBlockNumber)> _boundaryCompactJobs = Channel.CreateBounded<(StateId, long)>(16); + // Background workers and their in-flight compaction observe process-exit directly; graceful + // disposal instead completes the channels and drains the remaining work (see DisposeAsync). + private readonly CancellationToken _shutdownToken = processExitSource.Token; + private Task? _compactPersistedTask; + private Task[]? 
/// <summary>
/// Hands <paramref name="batch"/> to the background compactor, starting the workers on first use.
/// Ownership of the batch passes to the worker once the write succeeds; on any failure to enqueue
/// the batch is disposed here and the exception is rethrown.
/// </summary>
/// <param name="batch">Pooled list of state ids to compact.</param>
/// <param name="persistedBlockNumber">Block number forwarded to the compaction job.</param>
/// <param name="cancellationToken">Releases the wait for a free queue slot.</param>
public async ValueTask EnqueueAsync(ArrayPoolList<StateId> batch, long persistedBlockNumber, CancellationToken cancellationToken)
{
    try
    {
        // Fire-and-forget: EnsureStarted returns the long-running compactor task, which must not be awaited.
        _ = EnsureStarted();

        // Awaits a free slot on the bounded queue, providing backpressure without blocking a thread.
        await _compactPersistedJobs.Writer.WriteAsync((batch, persistedBlockNumber), cancellationToken);
    }
    catch
    {
        // The batch never entered the channel — whether the wait was cancelled or the channel was
        // already completed by DisposeAsync — so the handoff is still ours to dispose.
        batch.Dispose();
        throw;
    }
}

/// <summary>
/// Starts the persisted-compactor task and the boundary-compactor workers exactly once.
/// </summary>
/// <returns>The persisted-compactor task.</returns>
private Task EnsureStarted()
{
    // Guard against concurrent EnqueueAsync callers spawning duplicate worker sets.
    lock (_startLock)
    {
        _compactPersistedTask ??= RunPersistedCompactor(_shutdownToken);
        if (_boundaryCompactorTasks is null)
        {
            Task[] tasks = new Task[BoundaryCompactorWorkerCount];
            for (int i = 0; i < BoundaryCompactorWorkerCount; i++)
                tasks[i] = RunBoundaryCompactor(_shutdownToken);
            _boundaryCompactorTasks = tasks;
        }
        return _compactPersistedTask;
    }
}
{ex}"); + } + finally + { + batch.Dispose(); + } + } + } + catch (OperationCanceledException) + { + while (_compactPersistedJobs.Reader.TryRead(out (ArrayPoolList Batch, long PersistedBlockNumber) item)) + item.Batch.Dispose(); + } + } + + private async Task ProcessCompactBatch(ArrayPoolList batch, long persistedBlockNumber, CancellationToken cancellationToken) + { + if (batch.Count == 0) return; + + using ArrayPoolList largeBoundaries = new(batch.Count); + using ArrayPoolList compactSizeBoundaries = new(batch.Count); + SortedDictionary> buckets = []; + for (int i = 0; i < batch.Count; i++) + { + StateId s = batch[i]; + long b = s.BlockNumber; + if (b == 0) continue; + + if (_schedule.IsLargeCompactionBoundary(b)) + { + // Large boundary: needs the CompactSized snapshot AND the >CompactSize merge. + largeBoundaries.Add(s); + compactSizeBoundaries.Add(s); + } + else if (_schedule.IsCompactSizeBoundary(b)) + { + // Plain CompactSize boundary: only the CompactSized. + compactSizeBoundaries.Add(s); + } + else + { + // Non-boundary: bucket by power-of-2 alignment (always < CompactSize). + int compactSize = (int)_schedule.GetPersistedSnapshotCompactSize(b); + if (!buckets.TryGetValue(compactSize, out List? bucket)) + buckets[compactSize] = bucket = []; + bucket.Add(s); + } + } + + // Ascending bucket order: each sub-CompactSize layer's inputs (the previous layer's + // outputs) exist before it runs. + foreach (KeyValuePair> kv in buckets) + Parallel.ForEach(kv.Value, new ParallelOptions { CancellationToken = cancellationToken }, state => DoCompactSnapshot(state, persistedBlockNumber)); + + // Every boundary — CompactSize and large alike — lands on a CompactSize multiple, so each + // needs its CompactSized snapshot for RocksDB (persistence advances one CompactSize + // per step); both kinds are collected in compactSizeBoundaries above. 
+ foreach (StateId boundary in compactSizeBoundaries) + DoCompactCompactSized(boundary); + + // Large boundaries additionally carry a >CompactSize merge. These can be a few GB large, so + // they are handed to the boundary compactor to run as a separate background task rather than + // blocking this batch worker. + foreach (StateId boundary in largeBoundaries) + await _boundaryCompactJobs.Writer.WriteAsync((boundary, persistedBlockNumber), cancellationToken); + } + + private async Task RunBoundaryCompactor(CancellationToken cancellationToken) + { + try + { + await foreach ((StateId state, long persistedBlockNumber) in _boundaryCompactJobs.Reader.ReadAllAsync(cancellationToken)) + { + try + { + // Only large boundaries reach this channel; their CompactSized was already + // produced in ProcessCompactBatch, so DoCompactSnapshot here does the + // >CompactSize merge. + DoCompactSnapshot(state, persistedBlockNumber); + } + catch (Exception ex) + { + _logger.Error($"Error compacting boundary persisted snapshot {state}. {ex}"); + } + } + } + catch (OperationCanceledException) { } + } + + public async ValueTask DisposeAsync() + { + if (Interlocked.Exchange(ref _disposed, 1) != 0) return; + // Complete and drain the persisted stage first so any boundary jobs it produces are written + // before the boundary channel is completed; on process exit the shared token has already + // cancelled both stages, so these awaits return promptly instead of draining. + _compactPersistedJobs.Writer.Complete(); + if (_compactPersistedTask is not null) + await _compactPersistedTask; + _boundaryCompactJobs.Writer.Complete(); + if (_boundaryCompactorTasks is not null) + await Task.WhenAll(_boundaryCompactorTasks); + } + + /// + /// Compact the persisted snapshots ending at over the block's + /// natural power-of-2 window. Produces sub-CompactSize intermediates and the + /// >CompactSize merges; the CompactSize-wide window is + /// reserved for . 
Invoked by the background batch worker + /// (see ); not part of . + /// + /// + /// Does nothing when the block's window is a single snapshot (nothing to merge). The + /// CompactSize-wide window is produced by ; + /// routes those boundaries away from here, so this method + /// only ever sees sub-CompactSize intermediates and >CompactSize merges. + /// + public void DoCompactSnapshot(StateId snapshotTo, long persistedBlockNumber = 0) + { + long blockNumber = snapshotTo.BlockNumber; + int size = (int)_schedule.GetPersistedSnapshotCompactSize(blockNumber); + // size 1 is a single snapshot — nothing to merge. + if (size <= 1) return; + if (snapshotRepository.PersistedSnapshotCount < 2) return; + + // Window left edge is the raw block number (blockNumber - size); the alignment lives in + // offset-shifted space, so ((blockNumber-1)/size)*size would only be correct at offset 0. + // Clamped to the persistence point: snapshots below the persisted block are already in RocksDB, + // so merging them is wasted work. The clamp also makes the assemble walk reject a below-persistence + // large-compacted skip-pointer (whose To is above the persisted block but whose From is below it) + // and instead assemble from the persisted block upward via narrower edges. A no-op when + // persistedBlockNumber <= blockNumber - size. + long startingBlockNumber = Math.Max(blockNumber - size, persistedBlockNumber); + CompactRange(snapshotTo, startingBlockNumber, size, isCompactSized: false); + } + + /// + /// Produce the CompactSize-wide snapshot ending at the boundary + /// block — the snapshot PersistenceManager writes to + /// RocksDB. Invoked by the background batch worker (see ); not part of + /// . 
    /// <summary>
    /// Produces the CompactSize-wide snapshot ending at <paramref name="snapshotTo"/>. Does nothing
    /// unless the block is a CompactSize or large-compaction boundary and the repository holds at
    /// least two persisted snapshots.
    /// </summary>
    /// <param name="snapshotTo">Boundary state the CompactSize-wide window ends at.</param>
    public void DoCompactCompactSized(StateId snapshotTo)
    {
        long blockNumber = snapshotTo.BlockNumber;
        if (!_schedule.IsCompactSizeBoundary(blockNumber) && !_schedule.IsLargeCompactionBoundary(blockNumber)) return;

        if (snapshotRepository.PersistedSnapshotCount < 2) return;

        // The CompactSized snapshot is always CompactSize-wide; GetCompactSize returns exactly CompactSize at
        // any boundary (it caps there), so the window is (blockNumber - CompactSize, blockNumber]. No
        // persistence clamp: this CompactSize-wide window lands on a persistence boundary and never dips
        // below the persisted block.
        int compactSize = _schedule.GetCompactSize(blockNumber);
        CompactRange(snapshotTo, blockNumber - compactSize, compactSize, isCompactSized: true);
    }

    /// <summary>
    /// Assembles the persisted snapshots between <paramref name="startingBlockNumber"/> and
    /// <paramref name="snapshotTo"/>, merges them into one arena-backed snapshot, fsyncs it, records it
    /// in the catalog and registers it with the repository.
    /// </summary>
    /// <param name="snapshotTo">State the compacted range ends at.</param>
    /// <param name="startingBlockNumber">Left edge passed to the repository's assemble call.</param>
    /// <param name="compactSize">Target window size; used here only in the debug log line.</param>
    /// <param name="isCompactSized">When true the result is tagged <c>PersistedCompactSized</c>;
    /// otherwise the tier is large or small depending on whether the block is a large-compaction boundary.</param>
    /// <returns><c>true</c> when a compacted snapshot was written; <c>false</c> when fewer than two
    /// source snapshots were assembled.</returns>
    private bool CompactRange(StateId snapshotTo, long startingBlockNumber, int compactSize, bool isCompactSized)
    {
        using PersistedSnapshotList snapshots = snapshotRepository.AssemblePersistedSnapshotsForCompaction(snapshotTo, startingBlockNumber);
        if (snapshots.Count < 2) return false;

        if (_logger.IsDebug) _logger.Debug($"Compacting {snapshots.Count} persisted snapshots at block {snapshotTo.BlockNumber}, compact size {compactSize}, CompactSized {isCompactSized}");

        // The list is oldest-first, so the merged range runs from the first source's From to the last source's To.
        StateId from = snapshots[0].From;
        StateId to = snapshots[^1].To;

        // Open one WholeReadSession per source for the whole compaction. Every column
        // helper inside NWayMergeSnapshots reads through these views — one mmap +
        // MADV_NORMAL on open and one MADV_DONTNEED on close per source, regardless of
        // how many columns we walk. The session-dispose MADV_DONTNEED drops the source's
        // page cache. The ref_ids union is computed inside the merger directly from each
        // source's metadata value span — no pre-pass on this side.
        int n = snapshots.Count;
        using ArrayPoolList sessionsList = new(n, n);
        try
        {
            long estimatedSize = 0;
            long bloomCapacity = 0;
            // A large compaction adopts one bloom across the snapshots it contains, so the assembled
            // sources can share a single filter that already reports the whole window's key count.
            // Dedup by owner so a shared bloom is counted once instead of once per source — otherwise
            // bloomCapacity (and the merged filter) is inflated by the number of sharers.
            HashSet countedBlooms = [];
            for (int i = 0; i < n; i++)
            {
                // Session dispose madvises the source's mmap range cold — the compacted
                // snapshot that supersedes these sources warms its own cache lazily on the
                // first read of each address, so there's no value in keeping these pages.
                sessionsList[i] = snapshots[i].BeginWholeReadSession();

                estimatedSize += snapshots[i].Size;
                // Each source carries its own bloom; sum their key counts to size the merge.
                // The AlwaysTrue placeholder reports Count == 0, so a not-yet-built source just
                // contributes nothing — same as the old manager's sentinel did.
                if (countedBlooms.Add(snapshots[i].BloomRef))
                    bloomCapacity += snapshots[i].Bloom.Count;
            }

            // Bloom-disabled or empty-capacity case uses an AlwaysTrue sentinel so the
            // downstream AddCompactedSnapshot receives a non-null bloom uniformly.
            BloomFilter mergedBloom = _bloomBitsPerKey > 0 && bloomCapacity > 0
                ? new BloomFilter(bloomCapacity, _bloomBitsPerKey)
                : BloomFilter.AlwaysTrue();
            // A non-CompactSized merge at a large-compaction boundary spans >CompactSize — its own tier
            // so the assemble walk can prefer it as the widest skip-pointer. Computed up front so the
            // sub-CompactSize tier (PersistedSmallCompacted) lands in the separate small-arena pool.
            SnapshotTier tier = isCompactSized
                ? SnapshotTier.PersistedCompactSized
                : _schedule.IsLargeCompactionBoundary(snapshotTo.BlockNumber)
                    ? SnapshotTier.PersistedLargeCompacted
                    : SnapshotTier.PersistedSmallCompacted;

            SnapshotLocation location;
            ArenaReservation reservation;
            using (ArenaWriter arenaWriter = arenaManager.CreateWriter(estimatedSize, small: tier == SnapshotTier.PersistedSmallCompacted))
            {
                long sw = Stopwatch.GetTimestamp();
                PersistedSnapshotMerger.NWayMergeSnapshots(
                    sessionsList.AsSpan(), ref arenaWriter.GetWriter(), mergedBloom);

                long len = arenaWriter.GetWriter().Written;
                // The assembled window is best-effort and may fall short of compactSize, so label by the
                // actual compacted block span rounded up to the next power of two, not the target size.
                int actualSize = (int)BitOperations.RoundUpToPowerOf2((ulong)(to.BlockNumber - from.BlockNumber));
                CompactSizeLabel sizeLabel = new(actualSize);
                Metrics.PersistedSnapshotCompactedSize.Observe(len, sizeLabel);
                Metrics.PersistedSnapshotCompactTime.Observe(Stopwatch.GetTimestamp() - sw, sizeLabel);

                (location, reservation) = arenaWriter.Complete();
            }

            // Durability barrier — fsync the metadata arena before the catalog records the
            // compacted entry. No blob fsync here: compaction does not write new blobs, it
            // only emits NodeRefs into existing base blob arenas (those were fsynced when
            // their respective base snapshots were converted).
            // NOTE(review): if Fsync, _catalog.Add or the PersistedSnapshot constructor below throws,
            // reservation.Dispose() is never reached — confirm whether that lease needs a finally.
            reservation.Fsync();

            _catalog.Add(new CatalogEntry(from, to, location, tier));
            using (PersistedSnapshot compacted = new(from, to, reservation, blobs, tier, new RefCountedBloomFilter(mergedBloom)))
            {
                // Drop the local reservation handle now that the snapshot has been constructed from it.
                reservation.Dispose();
                snapshotRepository.AddPersistedSnapshot(compacted, tier);
                if (!_schedule.IsCompactSizeBoundary(snapshotTo.BlockNumber) && !_schedule.IsLargeCompactionBoundary(snapshotTo.BlockNumber))
                {
                    // Sub-CompactSize intermediate. The bundle priority means this is never queried
                    // unless there's a deep reorg, so drop its freshly-written pages from the cache +
                    // tracker; they would otherwise sit hot until the snapshot is pruned.
                    compacted.Demote();
                }
                else
                {
                    WarmAddressColumnIndex(compacted);
                    // A >CompactSize merge spans (from, to] on the canonical chain, so its bloom is a
                    // superset pre-filter for every persisted snapshot fully contained there. Adopt it
                    // across all of them — each then shares one bloom and frees its own (multi-MiB)
                    // filter, while still pre-filtering (unlike the AlwaysTrue demote sentinel).
                    // NOTE(review): this condition ignores isCompactSized, so the CompactSized snapshot
                    // written at a large boundary also shares its bloom across its range — confirm intended.
                    if (_schedule.IsLargeCompactionBoundary(snapshotTo.BlockNumber))
                        snapshotRepository.ShareBloomAcrossRange(from, to, compacted.BloomRef, blobs);
                }
            }

            Metrics.PersistedSnapshotCompactions++;
            return true;
        }
        finally
        {
            // Slots that were never filled (a failure part-way through the open loop) are skipped by the null check.
            for (int i = 0; i < n; i++) sessionsList[i]?.Dispose();
        }
    }

    /// <summary>
    /// Pre-fault the sorted table's index block + footer (the tail of a freshly-written large-tier
    /// snapshot) so it lands in the page-residency tracker. Without this, the first lookups take a
    /// chain of inline minor page faults walking the index.
    /// </summary>
    /// <param name="snapshot">Snapshot whose reservation is touched. Returns silently when the footer
    /// cannot be read, the table has no data blocks, or the computed index length is not positive.</param>
    internal static void WarmAddressColumnIndex(PersistedSnapshot snapshot)
    {
        ArenaReservation reservation = snapshot.Reservation;
        ArenaByteReader reader = reservation.CreateReader();
        Bound table = new(0, reader.Length);
        if (!SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer)
            || footer.NumDataBlocks == 0)
            return;

        // The reader is reservation-relative and TouchRangePopulate takes reservation-relative offsets.
        // The index block starts at the footer's recorded offset (just past the last, unpadded, data
        // block) and runs, with the footer, to the table end.
        long indexStart = SortedTable.IndexBlockStart(table, footer);
        long indexLen = table.Length - indexStart;
        if (indexLen <= 0) return;
        reservation.TouchRangePopulate(indexStart, indexLen);
    }
0x00 is + // below every real column (0xF9..0xFF), so ref-id records sort first and iterate cheaply from + // the table start; the value is a presence marker (PersistedSnapshotTags.RefIdValue). + internal const byte RefIdColumn = 0x00; + internal const int RefIdKeyLength = 1 + sizeof(ushort); + + // Column tag bytes = 255 - PersistedSnapshotTags column tag. + internal const byte MetadataColumn = 0xFF; // 255 - 0x00 + internal const byte AccountColumn = 0xFE; // 255 - 0x01 (per-address: account + self-destruct) + internal const byte SlotColumn = 0xFD; // 255 - 0x02 + internal const byte StateTopColumn = 0xFC; // 255 - 0x03 + internal const byte StateCompactColumn = 0xFB; // 255 - 0x04 + internal const byte StateFallbackColumn = 0xFA; // 255 - 0x05 + internal const byte StorageColumn = 0xF9; // 255 - 0x06 + + // Storage-trie subcolumn bytes = 255 - storage sub-tag. Storage has no top tier (it matches the + // persistence layout): paths 0-15 use the compact (8-byte) encoding, 16+ use the fallback. + internal const byte StorageCompactSub = 0xFE; // 255 - 0x01 + internal const byte StorageFallbackSub = 0xFD; // 255 - 0x02 + + // State top tier is 3-byte (path length 0-5), matching BaseTriePersistence's StateNodesTop column. + private const int StateTopPathThreshold = 5; + private const int CompactPathThreshold = 15; + + internal const int AddressKeyLength = Address.Size; // 20 + internal const int AddressHashPrefixLength = PersistedSnapshotTags.AddressHashPrefixLength; // 20 + internal const int SlotLength = 32; + + /// Largest materialized key: storage fallback = 1 + 20 + 1 + 33. + internal const int MaxKeyLength = 1 + AddressHashPrefixLength + 1 + 33; + + internal static int WriteMetadataKey(Span dst, scoped ReadOnlySpan name) + { + dst[0] = MetadataColumn; + name.CopyTo(dst[1..]); + return 1 + name.Length; + } + + /// Materialize a referenced blob-arena id record key: + the + /// id (big-endian, so ids sort numerically). 
+ internal static int WriteRefIdKey(Span dst, ushort blobArenaId) + { + dst[0] = RefIdColumn; + BinaryPrimitives.WriteUInt16BigEndian(dst[1..], blobArenaId); + return RefIdKeyLength; + } + + internal static ushort ReadRefId(scoped ReadOnlySpan key) => BinaryPrimitives.ReadUInt16BigEndian(key[1..]); + + internal static int WriteAccountKey(Span dst, scoped ReadOnlySpan address) + { + dst[0] = AccountColumn; + address.CopyTo(dst[1..]); + return 1 + AddressKeyLength; + } + + internal static int WriteSlotKey(Span dst, scoped ReadOnlySpan address, scoped ReadOnlySpan slot32) + { + dst[0] = SlotColumn; + address.CopyTo(dst[1..]); + slot32.CopyTo(dst[(1 + AddressKeyLength)..]); + return 1 + AddressKeyLength + SlotLength; + } + + internal static int WriteStateNodeKey(Span dst, scoped in TreePath path) + { + if (path.Length <= StateTopPathThreshold) + { + dst[0] = StateTopColumn; + path.EncodeWith3Byte(dst.Slice(1, 3)); + return 4; + } + if (path.Length <= CompactPathThreshold) + { + dst[0] = StateCompactColumn; + path.EncodeWith8Byte(dst.Slice(1, 8)); + return 9; + } + dst[0] = StateFallbackColumn; + path.Path.Bytes.CopyTo(dst[1..]); + dst[33] = (byte)path.Length; + return 34; + } + + internal static int WriteStorageNodeKey(Span dst, scoped ReadOnlySpan addressHash, scoped in TreePath path) + { + dst[0] = StorageColumn; + addressHash[..AddressHashPrefixLength].CopyTo(dst[1..]); + int pathStart = 2 + AddressHashPrefixLength; + if (path.Length <= CompactPathThreshold) + { + dst[1 + AddressHashPrefixLength] = StorageCompactSub; + path.EncodeWith8Byte(dst.Slice(pathStart, 8)); + return pathStart + 8; + } + dst[1 + AddressHashPrefixLength] = StorageFallbackSub; + path.Path.Bytes.CopyTo(dst[pathStart..]); + dst[pathStart + 32] = (byte)path.Length; + return pathStart + 33; + } + + // ---- read-side classification helpers (operate on a materialized key span) ---- + + internal static ReadOnlySpan PerAddressAddress(ReadOnlySpan key) => + key.Slice(1, AddressKeyLength); + + internal 
static ReadOnlySpan SlotColumnAddress(ReadOnlySpan key) => + key.Slice(1, AddressKeyLength); + + internal static ReadOnlySpan SlotColumnSlot(ReadOnlySpan key) => + key.Slice(1 + AddressKeyLength, SlotLength); + + internal static ReadOnlySpan StorageAddressHash(ReadOnlySpan key) => + key.Slice(1, AddressHashPrefixLength); + + internal static byte StorageSubColumn(scoped ReadOnlySpan key) => key[1 + AddressHashPrefixLength]; + + internal static ReadOnlySpan StoragePathBytes(ReadOnlySpan key) => + key[(2 + AddressHashPrefixLength)..]; + + internal static ReadOnlySpan StatePathBytes(ReadOnlySpan key) => key[1..]; + + /// Decode a state/storage path key, given its column or subcolumn-derived stage + /// (0 = state top/3-byte, 1 = compact/8-byte, else fallback/33-byte). Storage never uses stage 0. + internal static TreePath DecodePath(scoped ReadOnlySpan encoded, int stage) => stage switch + { + 0 => TreePath.DecodeWith3Byte(encoded), + 1 => TreePath.DecodeWith8Byte(encoded), + _ => new TreePath(new ValueHash256(encoded[..32]), encoded[32]), + }; +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotList.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotList.cs new file mode 100644 index 000000000000..7d882498a7cf --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotList.cs @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Collections; +using Nethermind.Core.Collections; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// A list of persisted snapshots ordered oldest-first (index 0 = oldest). +/// Probe logic lives in . 
/// <summary>
/// A list of persisted snapshots ordered oldest-first (index 0 = oldest), backed by a pooled list.
/// Disposing the list calls <c>DisposeRecursive</c> on the backing list.
/// </summary>
public sealed class PersistedSnapshotList : IDisposable, IEnumerable<PersistedSnapshot>
{
    private readonly ArrayPoolList<PersistedSnapshot> _list;

    /// <summary>Creates an empty list; <paramref name="initial"/> is passed to the pooled backing list.</summary>
    public PersistedSnapshotList(int initial)
    {
        _list = new ArrayPoolList<PersistedSnapshot>(initial);
    }

    // Wraps an already-created backing list; used by Empty().
    private PersistedSnapshotList(ArrayPoolList<PersistedSnapshot> list)
    {
        _list = list;
    }

    /// <summary>Number of snapshots currently in the list.</summary>
    public int Count
    {
        get { return _list.Count; }
    }

    /// <summary>Snapshot at the given position.</summary>
    public PersistedSnapshot this[int index]
    {
        get { return _list[index]; }
    }

    /// <summary>Snapshot at the given position; supports from-end indices such as <c>^1</c>.</summary>
    public PersistedSnapshot this[Index index]
    {
        get { return _list[index]; }
    }

    /// <summary>Appends <paramref name="snapshot"/> to the end of the list.</summary>
    public void Add(PersistedSnapshot snapshot)
    {
        _list.Add(snapshot);
    }

    /// <summary>Reverses the order of the list in place.</summary>
    public void Reverse()
    {
        _list.Reverse();
    }

    /// <summary>Creates a list wrapping the pooled list's empty instance.</summary>
    public static PersistedSnapshotList Empty()
    {
        return new PersistedSnapshotList(ArrayPoolList<PersistedSnapshot>.Empty());
    }

    /// <inheritdoc />
    public IEnumerator<PersistedSnapshot> GetEnumerator()
    {
        return _list.GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <inheritdoc />
    public void Dispose()
    {
        _list.DisposeRecursive();
    }
}
/// <summary>
/// Loads the persisted snapshot tier from the catalog at startup, converts in-memory snapshots into
/// persisted ones at runtime, and flags the tier for shutdown preservation when disposed.
/// </summary>
/// <remarks>
/// A registered singleton that depends on the snapshot repository and the arena/blob/catalog stores.
/// Because it depends on the repository, DI disposes it before the repository; and because the
/// compactor depends on this loader, DI disposes the compactor (draining its bucket-touching workers)
/// before it — so <see cref="Dispose"/> marks the persisted tier only after all such work has stopped.
/// </remarks>
public sealed class PersistedSnapshotLoader(
    ISnapshotRepository repository,
    IArenaManager arena,
    BlobArenaManager blobs,
    ISnapshotCatalog catalog,
    IFlatDbConfig config,
    ILogManager logManager) : IPersistedSnapshotLoader
{
    // Below this many catalog entries / bloom picks we skip the progress logger and
    // the heartbeat timer — the cost of one Parallel.ForEach over a tiny input is in
    // the µs range, well below the bookkeeping overhead the logger adds per tick.
    private const int ParallelLoadThreshold = 1024;
    // Heartbeat for the progress logger inside the parallel sections. The logger
    // itself dedups via state-change comparison, so sub-second ticks are cheap.
    private const int ProgressLogIntervalMs = 1000;

    private readonly ISnapshotCatalog _catalog = catalog;
    private readonly double _bloomBitsPerKey = config.PersistedSnapshotBloomBitsPerKey;
    private readonly bool _validatePersistedSnapshot = config.ValidatePersistedSnapshot;
    private readonly ILogger _logger = logManager.GetClassLogger();
    private int _disposed;

    private bool BloomEnabled => _bloomBitsPerKey > 0;

    /// <summary>
    /// Rebuilds the persisted tier from the catalog: initialises the blob and arena stores, loads
    /// every catalog entry into the repository, sweeps unreferenced blob files, then reconstructs blooms.
    /// </summary>
    /// <remarks>
    /// Each catalog entry is routed by its stored tier (range alone cannot tell a base from a
    /// sub-CompactSize compacted snapshot apart). Entries are loaded with <c>Parallel.ForEach</c>;
    /// above <see cref="ParallelLoadThreshold"/> entries a progress logger with a heartbeat timer is
    /// attached as well.
    /// </remarks>
    public void Load()
    {
        // Blob arena pool first — rehydrates file lengths so the PersistedSnapshot ctor's TryLeaseFile
        // calls (driven by each snapshot's ref_ids metadata) can resolve the ids. Whole-file
        // reservations are created lazily on first lease.
        blobs.Initialize();

        // Can be millions of entries on a long-running node — materialised once and shared by the
        // arena init and the parallel load below.
        List<CatalogEntry> entries = [.. _catalog.Load()];
        arena.Initialize(entries);

        LoadSnapshotsParallel(entries);

        // Delete any blob arena file no loaded snapshot referenced — recoverable
        // orphans from a mid-write crash.
        blobs.SweepUnreferenced();

        ReconstructBloom(entries);
    }

    /// <summary>
    /// Loads every entry in parallel, reporting progress once per heartbeat when the catalog is large
    /// and info logging is enabled.
    /// </summary>
    private void LoadSnapshotsParallel(List<CatalogEntry> entries)
    {
        ProgressLogger? loadLog = null;
        Timer? heartbeat = null;
        if (entries.Count > ParallelLoadThreshold && _logger.IsInfo)
        {
            loadLog = new ProgressLogger("Persisted snapshot load", logManager);
            loadLog.Reset(0, entries.Count);
            heartbeat = new Timer(ProgressLogIntervalMs);
            heartbeat.Elapsed += (_, _) => loadLog.LogProgress();
            heartbeat.Start();
        }

        try
        {
            long loaded = 0;
            Parallel.ForEach(entries, entry =>
            {
                LoadSnapshot(entry);
                if (loadLog is not null) loadLog.Update(Interlocked.Increment(ref loaded));
            });
            // Final report so the last state is logged even if no heartbeat fired after completion.
            loadLog?.LogProgress();
        }
        finally
        {
            heartbeat?.Dispose();
        }
    }

    /// <summary>
    /// Opens one catalog entry's reservation, builds its <see cref="PersistedSnapshot"/> and registers
    /// it with the repository. No catalog write: the entry is already in the catalog (we are reading
    /// from it). Called from the parallel load.
    /// </summary>
    private void LoadSnapshot(CatalogEntry entry)
    {
        ArenaReservation reservation = arena.Open(entry.Location);

        // The ctor walks its own ref_ids metadata and leases each blob arena file (rolling back on
        // partial failure) and takes its own lease on the reservation, so we drop ours right after.
        // The bloom is the AlwaysTrue placeholder — ReconstructBloom replaces this snapshot with one
        // carrying the real bloom once every snapshot is in place. The `using` drops the construction
        // lease at the end; the bucket keeps its own.
        using PersistedSnapshot snapshot = new(entry.From, entry.To, reservation, blobs, entry.Tier, RefCountedBloomFilter.AlwaysTrue());
        reservation.Dispose();
        repository.AddPersistedSnapshot(snapshot, entry.Tier);
    }

    /// <summary>
    /// Rebuild a bloom only for each widest snapshot covering the persisted tier and share it across its
    /// range, so the narrower contained snapshots adopt it instead of each carrying its own — mirroring
    /// the runtime layout a large compaction leaves behind. Snapshots no widest one covers keep their
    /// AlwaysTrue placeholder (correct — never a false negative — just unfiltered).
    /// </summary>
    /// <remarks>
    /// The chain is assembled through the repository's main read path from the last snapshot id down
    /// to the committed base, which is taken to be the lowest <c>From</c> among the loaded entries.
    /// The blooms are rebuilt in parallel; chain ranges are disjoint, so the per-range share calls
    /// don't collide. Does nothing when blooms are disabled, the catalog is empty, the repository has
    /// no last snapshot id, or head equals the committed base.
    /// </remarks>
    private void ReconstructBloom(List<CatalogEntry> entries)
    {
        if (!BloomEnabled || entries.Count == 0) return;
        if (repository.GetLastSnapshotId() is not StateId head) return;

        // The persisted tier sits on the committed base — the oldest loaded snapshot's From.
        StateId committed = entries[0].From;
        foreach (CatalogEntry e in entries)
            if (e.From.BlockNumber < committed.BlockNumber) committed = e.From;
        if (head == committed) return;

        // Widest-first chain from head down to the committed base; .InMemory is empty at reload.
        int estimatedSize = (int)Math.Clamp(head.BlockNumber - committed.BlockNumber, 4, 4096);
        AssembledSnapshotResult assembled = repository.AssembleSnapshots(head, committed, estimatedSize);
        assembled.InMemory.Dispose();
        using PersistedSnapshotList widest = assembled.Persisted;

        // Build the (few, wide) blooms in parallel and share each across its range. A fresh bloom
        // (refcount 1) is leased by each snapshot ShareBloomAcrossRange re-registers; the local lease is
        // released on dispose, leaving the shared snapshots holding theirs.
        Parallel.ForEach(widest, snap =>
        {
            RefCountedBloomFilter bloom;
            using (WholeReadSession session = snap.BeginWholeReadSession())
                bloom = new RefCountedBloomFilter(PersistedSnapshotBloomBuilder.Build(session, snap, _bloomBitsPerKey));
            using (bloom)
                repository.ShareBloomAcrossRange(snap.From, snap.To, bloom, blobs);
        });
    }

    /// <summary>
    /// Writes <paramref name="snapshot"/> to the arena and blob stores, fsyncs both, records the
    /// catalog entry and registers the resulting base-tier persisted snapshot with the repository.
    /// When validation is enabled the persisted form is checked against the source snapshot and the
    /// process exits on a validation failure.
    /// </summary>
    /// <param name="snapshot">In-memory snapshot to persist.</param>
    public void ConvertAndRegister(Snapshot snapshot)
    {
        // One unified bloom covering account/slot/SD keys + state-trie + storage-trie paths.
        // Sized as the union of both expected key counts at the configured bits-per-key.
        BloomFilter bloom;
        if (BloomEnabled)
        {
            long capacity = (long)snapshot.AccountsCount
                + snapshot.Content.SelfDestructedStorageAddresses.Count
                + 2L * snapshot.StoragesCount
                + snapshot.StateNodesCount
                + snapshot.StorageNodesCount;
            bloom = new BloomFilter(Math.Max(capacity, 1), _bloomBitsPerKey);
        }
        else
        {
            bloom = BloomFilter.AlwaysTrue();
        }

        long estimatedSize = PersistedSnapshotBuilder.EstimateSize(snapshot);

        SnapshotLocation location;
        ArenaReservation reservation;
        using BlobArenaWriter blobWriter = blobs.CreateWriter(estimatedSize);
        // Base snapshots are always sub-CompactSize (single-block window) and read-cold after
        // compaction — pack their metadata into the separate small-arena pool.
        using (ArenaWriter arenaWriter = arena.CreateWriter(estimatedSize, small: true))
        {
            PersistedSnapshotBuilder.Build(
                snapshot, ref arenaWriter.GetWriter(), blobWriter, bloom);
            Metrics.PersistedSnapshotSize.Observe(arenaWriter.GetWriter().Written);
            (location, reservation) = arenaWriter.Complete();
        }
        blobWriter.Complete();

        // Durability barrier — fsync both the metadata arena and the blob arena before the
        // catalog records the new entry. A crash between this point and the next persistence
        // checkpoint would otherwise leave the catalog pointing at unsynced pages whose
        // contents are not yet guaranteed to be on disk.
        reservation.Fsync();
        blobWriter.Fsync();

        if (_logger.IsDebug) _logger.Debug($"Persisted snapshot {snapshot.From.BlockNumber}->{snapshot.To.BlockNumber} to disk (arena {location.ArenaId}, {location.Size} bytes)");

        // Build the persisted snapshot (its ctor takes its own reservation + blob leases, so we drop
        // ours), record the catalog entry, then index it. AddPersistedSnapshot takes the bucket's own
        // lease; the `using` releases this construction lease when the method exits — including when
        // the catalog write, the indexing or validation throws, which previously leaked the lease.
        using PersistedSnapshot persisted = new(snapshot.From, snapshot.To, reservation, blobs, SnapshotTier.PersistedBase, new RefCountedBloomFilter(bloom));
        reservation.Dispose();
        _catalog.Add(new CatalogEntry(snapshot.From, snapshot.To, location, SnapshotTier.PersistedBase));
        repository.AddPersistedSnapshot(persisted, SnapshotTier.PersistedBase);

        if (!_validatePersistedSnapshot) return;

        try
        {
            PersistedSnapshotUtils.ValidatePersistedSnapshot(snapshot, persisted);
        }
        catch (InvalidOperationException ex)
        {
            // Validation runs on a background persistence thread; an unhandled throw here would either
            // be swallowed (looking like a good run) or crash the process with a 128+ code that git
            // bisect treats as "abort". Exit explicitly with a bisect-compatible "bad" code instead.
            if (_logger.IsError) _logger.Error($"Persisted snapshot validation failed for range {snapshot.From.BlockNumber}..{snapshot.To.BlockNumber}. Exiting with code {ExitCodes.GeneralError} for git bisect compatibility.", ex);
            Environment.Exit(ExitCodes.GeneralError);
        }
    }

    /// <summary>
    /// Flags the persisted tier's files for shutdown preservation. This is the loader's only teardown
    /// step; the container disposes the rest — the repository (tearing down its buckets) and then the
    /// arena/blob managers it depends on. Because the loader depends on the repository, DI disposes
    /// the loader before the repository, so the mark always lands before the buckets are torn down.
    /// Idempotent: only the first call marks the tier.
    /// </summary>
    public void Dispose()
    {
        if (Interlocked.Exchange(ref _disposed, 1) != 0) return;
        repository.MarkPersistedTierForShutdown();
    }
}
+/// +/// +/// Generic over the byte-reader source so it isn't bound to a specific reader; each input is an +/// that mints a fresh reader on demand (production +/// drives it with ). The deliberately-unoptimized find-min is +/// O(N) per step. +/// +public static class PersistedSnapshotMerger +{ + // A resolved per-address self-destruct record. Barrier is the newest source index that destructed + // Address (-1 when the address carries a self-destruct record but was only ever "new"). Built up + // front (BuildSelfDestructBarriers) because slots sort before the account column these live in; + // membership means the address has a self-destruct (Barrier >= 0 → destructed, else new). + private readonly struct SelfDestructBarrier(ValueAddress address, int barrier) + { + public readonly ValueAddress Address = address; + public readonly int Barrier = barrier; + } + + /// + /// N-way merge of N persisted snapshots (oldest-first) into . Callers + /// own the source lifecycle: open one reader source per input up front, pass them here, dispose + /// after the merge returns. + /// + internal static void NWayMergeSnapshots( + ReadOnlySpan views, ref TWriter writer, BloomFilter bloom) + where TWriter : IByteBufferWriter + where TView : IByteReaderSource + where TReader : IByteReader, allows ref struct + where TPin : struct, IBufferPin, allows ref struct + { + ArgumentNullException.ThrowIfNull(bloom); + + // The table is built by streaming in strictly ascending key order: entries (ref-ids 0x00 … + // per-address 0xFE) first via the N-way merge, then metadata (0xFF) last. + SortedTableBuilder table = new(ref writer); + try + { + MergeEntries(views, ref table, bloom); + MergeMetadata(views, ref table); + table.Build(); + } + finally + { + table.Dispose(); + } + } + + /// + /// Streaming N-way merge of every non-metadata entry. 
Per key the newest source wins, except two + /// per-address cases resolved against the up-front self-destruct barriers + /// (, built first because slots sort before the account + /// column): slots (column 0xFD) truncated by a later self-destruct are dropped, and the per-address + /// account-column value (0xFE) is rebuilt by + /// — newest non-Absent account paired with the merged self-destruct state. + /// + private static void MergeEntries( + ReadOnlySpan views, ref SortedTableBuilder table, BloomFilter bloom) + where TWriter : IByteBufferWriter + where TView : IByteReaderSource + where TReader : IByteReader, allows ref struct + where TPin : struct, IBufferPin, allows ref struct + { + int n = views.Length; + + // Slots sort before the account column, so a slot's self-destruct barrier is not yet known when + // the slot column streams past. Resolve every barrier first (self-destructs are rare → small). + SelfDestructBarrier[] barriers = BuildSelfDestructBarriers(views); + // Separate monotonic cursors over the ascending barriers: slots are processed before accounts. + int slotBarrierIdx = 0; + int accountBarrierIdx = 0; + + SortedTableEnumerator[] enums = new SortedTableEnumerator[n]; + bool[] hasMore = new bool[n]; + for (int i = 0; i < n; i++) + { + TReader r = views[i].CreateReader(); + enums[i] = new SortedTableEnumerator(in r, new Bound(0, r.Length)); + hasMore[i] = enums[i].MoveNext(in r); + } + + Span minKey = stackalloc byte[PersistedSnapshotKey.MaxKeyLength]; + // n is the number of merged inputs (small in practice); cap the stackalloc and fall back to + // the heap for an unusually large compaction batch to avoid a stack overflow. + Span matching = n <= 64 ? stackalloc int[64] : new int[n]; + + // Scratch buffer for the rebuilt per-address [account, selfdestruct] value (see EmitCombined). 
+ byte[] rlpBuffer = ArrayPool.Shared.Rent(256); + RlpStream rlpStream = new(rlpBuffer); + try + { + while (true) + { + int minIdx = -1; + for (int i = 0; i < n; i++) + { + if (!hasMore[i]) continue; + if (minIdx < 0 || enums[i].CurrentKey.SequenceCompareTo(enums[minIdx].CurrentKey) < 0) + minIdx = i; + } + if (minIdx < 0) break; + + ReadOnlySpan minKeySrc = enums[minIdx].CurrentKey; + int keyLen = minKeySrc.Length; + minKeySrc.CopyTo(minKey); + ReadOnlySpan key = minKey[..keyLen]; + + // Metadata (column 0xFF) sorts last and is produced separately by MergeMetadata. + if (key[0] == PersistedSnapshotKey.MetadataColumn) break; + + int matchCount = 0; + for (int i = 0; i < n; i++) + if (hasMore[i] && enums[i].CurrentKey.SequenceEqual(key)) + matching[matchCount++] = i; + int newest = matching[matchCount - 1]; + + if (key[0] == PersistedSnapshotKey.SlotColumn) + { + // Drop slots truncated by a later self-destruct; emit the rest newest-wins. + if (!IsSlotTruncated(barriers, ref slotBarrierIdx, PersistedSnapshotKey.SlotColumnAddress(key), newest)) + EmitNewest(views, enums, ref table, bloom, key, newest); + } + else if (key[0] == PersistedSnapshotKey.AccountColumn) + { + SelfDestructState sd = LookupSelfDestruct(barriers, ref accountBarrierIdx, PersistedSnapshotKey.PerAddressAddress(key)); + EmitCombined(views, enums, ref table, bloom, key, matching[..matchCount], sd, rlpStream, rlpBuffer); + } + else // ref-id, or state / storage trie node + { + EmitNewest(views, enums, ref table, bloom, key, newest); + } + + for (int k = 0; k < matchCount; k++) + { + int i = matching[k]; + TReader r = views[i].CreateReader(); + hasMore[i] = enums[i].MoveNext(in r); + } + } + } + finally + { + ArrayPool.Shared.Return(rlpBuffer); + } + } + + /// + /// Resolve every self-destructing address's barrier by decoding the self-destruct item of each + /// source's per-address values, scanning only the account column (seeked via + /// so the rest of the table is skipped). 
Returns + /// the addresses that carry a self-destruct in any source, sorted ascending: Barrier is the + /// newest source that destructed (or -1 when the address was only ever "new"). Self-destructs are + /// rare, so the working set is small. + /// + private static SelfDestructBarrier[] BuildSelfDestructBarriers(ReadOnlySpan views) + where TView : IByteReaderSource + where TReader : IByteReader, allows ref struct + where TPin : struct, IBufferPin, allows ref struct + { + Span accountColKey = stackalloc byte[1]; + accountColKey[0] = PersistedSnapshotKey.AccountColumn; + Span valueBuf = stackalloc byte[256]; + + List<(ValueAddress Addr, int Source, bool IsDestruct)> selfDestructs = []; + for (int i = 0; i < views.Length; i++) + { + TReader r = views[i].CreateReader(); + Bound table = new(0, r.Length); + if (!SortedTableReader.TryFindStartBlock(in r, table, accountColKey, out long startBlock)) + continue; + + SortedTableEnumerator e = new(in r, table, startBlock); + while (e.MoveNext(in r)) + { + ReadOnlySpan key = e.CurrentKey; + byte col = key[0]; + if (col < PersistedSnapshotKey.AccountColumn) continue; // trailing slots in the start block + if (col > PersistedSnapshotKey.AccountColumn) break; // past the account column + + int vlen = checked((int)e.CurrentValue.Length); + Span v = vlen <= 256 ? valueBuf[..vlen] : new byte[vlen]; + if (!r.TryRead(e.CurrentValue.Offset, v)) continue; + SelfDestructState sd = PersistedSnapshotPerAddress.DecodeSelfDestructState(v); + if (sd == SelfDestructState.None) continue; + selfDestructs.Add((new ValueAddress(PersistedSnapshotKey.PerAddressAddress(key)), i, sd == SelfDestructState.Destructed)); + } + } + + if (selfDestructs.Count == 0) return []; + + // Sort by (address asc, source asc) and reduce each address-run to one barrier whose Barrier is + // the newest destructing source (-1 if the address was only ever "new"). 
Operands are copied + // into locals before taking AsSpan: a span over a List-indexer temporary (ValueAddress.AsSpan + // uses Unsafe.AsRef on the struct's storage) can alias a reused stack slot, making SequenceEqual + // spuriously true. + selfDestructs.Sort(static (a, b) => + { + ValueAddress aa = a.Addr, bb = b.Addr; + int cmp = aa.AsSpan.SequenceCompareTo(bb.AsSpan); + return cmp != 0 ? cmp : a.Source.CompareTo(b.Source); + }); + + List barriers = []; + int runStart = 0; + for (int i = 0; i < selfDestructs.Count; i++) + { + ValueAddress cur = selfDestructs[i].Addr; + bool lastOfRun = i + 1 == selfDestructs.Count; + if (!lastOfRun) + { + ValueAddress next = selfDestructs[i + 1].Addr; + lastOfRun = !next.AsSpan.SequenceEqual(cur.AsSpan); + } + if (lastOfRun) + { + int barrier = -1; // sorted by source asc, so the last destruct in the run is the newest + for (int j = runStart; j <= i; j++) + if (selfDestructs[j].IsDestruct) barrier = selfDestructs[j].Source; + barriers.Add(new SelfDestructBarrier(cur, barrier)); + runStart = i + 1; + } + } + return [.. barriers]; + } + + /// + /// Whether a slot at whose newest contributing source is + /// is truncated by a later self-destruct. + /// is a monotonic cursor over the ascending , advanced in lockstep with + /// the ascending slot column. + /// + private static bool IsSlotTruncated(SelfDestructBarrier[] barriers, ref int barrierIdx, scoped ReadOnlySpan slotAddr, int newest) + { + while (barrierIdx < barriers.Length && barriers[barrierIdx].Address.AsSpan.SequenceCompareTo(slotAddr) < 0) + barrierIdx++; + return barrierIdx < barriers.Length + && barriers[barrierIdx].Address.AsSpan.SequenceEqual(slotAddr) + && newest < barriers[barrierIdx].Barrier; + } + + /// The merged self-destruct state for , read from the ascending + /// via the monotonic cursor (the account + /// column streams in the same ascending address order). 
Destructed when any source in the merged + /// range destructed (Barrier >= 0), New when the address carries only "new" records, else None. + private static SelfDestructState LookupSelfDestruct(SelfDestructBarrier[] barriers, ref int barrierIdx, scoped ReadOnlySpan addr) + { + while (barrierIdx < barriers.Length && barriers[barrierIdx].Address.AsSpan.SequenceCompareTo(addr) < 0) + barrierIdx++; + if (barrierIdx < barriers.Length && barriers[barrierIdx].Address.AsSpan.SequenceEqual(addr)) + return barriers[barrierIdx].Barrier >= 0 ? SelfDestructState.Destructed : SelfDestructState.New; + return SelfDestructState.None; + } + + /// + /// Rebuild and emit the per-address [account, selfdestruct] value. The account is taken from + /// the newest matching source whose account item is not , so an + /// older real account survives a newer self-destruct-only entry — this replicates the prior + /// separate account-key newest-wins. The self-destruct state is the merged . + /// + /// + /// Emitting Destructed whenever any source in the range destructed (even if a newer source + /// re-created the contract) is deliberate and matches the only consumer that reads the flag value, + /// : writing a CompactSized snapshot to RocksDB does + /// if (SelfDestructFlag is false) batch.SelfDestruct(addr) before re-applying the account and + /// the (already barrier-filtered) post-destruct slots, clearing any storage carried from before the + /// range so a re-created contract ends with exactly its new slots. Emitting New there would skip the + /// clear and leak pre-destruct storage. The read path otherwise keys off presence + /// (). 
+    ///
+    private static void EmitCombined(
+        ReadOnlySpan views, SortedTableEnumerator[] enums,
+        ref SortedTableBuilder table, BloomFilter bloom, scoped ReadOnlySpan key,
+        scoped ReadOnlySpan matching, SelfDestructState sd, RlpStream rlpStream, byte[] rlpBuffer)
+        where TWriter : IByteBufferWriter
+        where TView : IByteReaderSource
+        where TReader : IByteReader, allows ref struct
+        where TPin : struct, IBufferPin, allows ref struct
+    {
+        AccountState accountState = AccountState.Absent;
+        Account? account = null;
+        // Walk the matching sources from the last entry down and keep the first account item that is
+        // not Absent. MergeEntries fills matching in ascending source order, so the last entry is the
+        // newest source. If every source is Absent, the value is emitted with an Absent account.
+        for (int k = matching.Length - 1; k >= 0; k--)
+        {
+            int src = matching[k];
+            TReader r = views[src].CreateReader();
+            using TPin pin = r.PinBuffer(enums[src].CurrentValue);
+            AccountState state = PersistedSnapshotPerAddress.DecodeAccount(pin.Buffer, out Account? decoded);
+            if (state != AccountState.Absent)
+            {
+                accountState = state;
+                account = decoded;
+                break;
+            }
+        }
+
+        // rlpStream is the stream MergeEntries constructs over rlpBuffer, so the encoded bytes are read
+        // back from rlpBuffer. Encode returns stream.Position; presumably Reset rewinds it to 0 so that
+        // len counts from the start of the buffer — TODO confirm against RlpStream.
+        rlpStream.Reset();
+        int len = PersistedSnapshotPerAddress.Encode(rlpStream, accountState, account, sd);
+        table.Add(key, rlpBuffer.AsSpan(0, len));
+        // Same address bloom key that AddBloomForKey computes for an account-column key.
+        bloom.Add(PersistedSnapshotBloomBuilder.AddressKey(PersistedSnapshotKey.PerAddressAddress(key)));
+    }
+
+    /// <summary>Emit the newest source's value for <paramref name="key"/> (slot / ref-id / state node /
+    /// storage node) and add the matching bloom key. Account-column keys do not come through here:
+    /// MergeEntries routes them to <c>EmitCombined</c>.</summary>
+    private static void EmitNewest(
+        ReadOnlySpan views, SortedTableEnumerator[] enums,
+        ref SortedTableBuilder table, BloomFilter bloom, scoped ReadOnlySpan key, int newest)
+        where TWriter : IByteBufferWriter
+        where TView : IByteReaderSource
+        where TReader : IByteReader, allows ref struct
+        where TPin : struct, IBufferPin, allows ref struct
+    {
+        // Pin the current value of source `newest` and add it to the output table under `key`.
+        TReader r = views[newest].CreateReader();
+        using TPin pin = r.PinBuffer(enums[newest].CurrentValue);
+        table.Add(key, pin.Buffer);
+        AddBloomForKey(bloom, key);
+    }
+
+    /// <summary>
+    /// Add the bloom-filter key(s) for <paramref name="key"/>, selected by its column byte
+    /// (<c>key[0]</c>): nothing for a ref-id record; the address key plus an address+slot key for a
+    /// slot; the address key for an account-column entry; for a storage-trie node, a ulong read from
+    /// the start of the address-hash bytes XORed with the path key; and the path key alone for every
+    /// other column (the state-trie node columns).
+    /// </summary>
+    private static void AddBloomForKey(BloomFilter bloom, ReadOnlySpan key)
+    {
+        switch (key[0])
+        {
+            case PersistedSnapshotKey.RefIdColumn:
+                break; // ref-id presence records are not bloom-gated
+            case PersistedSnapshotKey.SlotColumn:
+                // Two entries per slot: the owning address, then the (address key, slot) pair.
+                ulong slotAddrKey = PersistedSnapshotBloomBuilder.AddressKey(PersistedSnapshotKey.SlotColumnAddress(key));
+                bloom.Add(slotAddrKey);
+                bloom.Add(PersistedSnapshotBloomBuilder.SlotKey(slotAddrKey, PersistedSnapshotKey.SlotColumnSlot(key)));
+                break;
+            case PersistedSnapshotKey.AccountColumn:
+                // Not reached from EmitNewest in this class: MergeEntries sends account-column keys to
+                // EmitCombined, which adds this same address key itself.
+                bloom.Add(PersistedSnapshotBloomBuilder.AddressKey(PersistedSnapshotKey.PerAddressAddress(key)));
+                break;
+            case PersistedSnapshotKey.StorageColumn:
+                ulong addrHashKey = MemoryMarshal.Read(PersistedSnapshotKey.StorageAddressHash(key));
+                bloom.Add(addrHashKey ^ PersistedSnapshotBloomBuilder.StatePathKey(PersistedSnapshotKey.StoragePathBytes(key)));
+                break;
+            default: // state-trie node columns
+                bloom.Add(PersistedSnapshotBloomBuilder.StatePathKey(PersistedSnapshotKey.StatePathBytes(key)));
+                break;
+        }
+    }
+
+    ///
+    /// Merge metadata: from_block / from_hash from the oldest source, to_block / to_hash / version
+    /// from the newest, and a noderefs presence marker. Ref-id records are not written here: they are
+    /// column 0x00 entries and are merged with the other entries by MergeEntries.
+    ///
+    private static void MergeMetadata(
+        ReadOnlySpan views, ref SortedTableBuilder table)
+        where TWriter : IByteBufferWriter
+        where TView : IByteReaderSource
+        where TReader : IByteReader, allows ref struct
+        where TPin : struct, IBufferPin, allows ref struct
+    {
+        int n = views.Length;
+        // Inputs are oldest-first (see NWayMergeSnapshots), so index 0 is the oldest and n - 1 the newest.
+        // NOTE(review): an empty views span fails on views[0]; no guard is visible in this class —
+        // confirm callers never pass one.
+        TReader oldest = views[0].CreateReader();
+        Bound oldestTable = new(0, oldest.Length);
+        TReader newest = views[n - 1].CreateReader();
+        Bound newestTable = new(0, newest.Length);
+
+        // Metadata keys (column 0xFF) are emitted in ascending name order so the streaming builder's
+        // strict-ascending invariant holds: from_block < from_hash < noderefs < to_block < to_hash < version.
+        AddMetadataField(ref table, in oldest, oldestTable, PersistedSnapshotTags.MetadataFromBlockKey);
+        AddMetadataField(ref table, in oldest, oldestTable, PersistedSnapshotTags.MetadataFromHashKey);
+
+        // The noderefs marker is a constant written directly; it is not copied from any source.
+        Span noderefsKey = stackalloc byte[1 + PersistedSnapshotTags.MetadataKeyLength];
+        int noderefsLen = PersistedSnapshotKey.WriteMetadataKey(noderefsKey, PersistedSnapshotTags.MetadataNodeRefsKey);
+        table.Add(noderefsKey[..noderefsLen], PersistedSnapshotTags.MetadataNodeRefsPresentMarker);
+
+        AddMetadataField(ref table, in newest, newestTable, PersistedSnapshotTags.MetadataToBlockKey);
+        AddMetadataField(ref table, in newest, newestTable, PersistedSnapshotTags.MetadataToHashKey);
+        AddMetadataField(ref table, in newest, newestTable, PersistedSnapshotTags.MetadataVersionKey);
+
+        // ref-id records (column 0x00) are not metadata — they flow through the normal entry merge
+        // (MergeEntries), which dedups them across sources into the union for free.
+    }
+
+    /// <summary>
+    /// Copy one metadata field from a source table into the output: builds the metadata key for
+    /// <paramref name="name"/>, seeks it in <paramref name="metaTable"/> through
+    /// <paramref name="reader"/>, and on a hit adds the key with the pinned source value to
+    /// <paramref name="table"/>. A field that the source does not contain is skipped without error.
+    /// </summary>
+    private static void AddMetadataField(
+        ref SortedTableBuilder table, scoped in TReader reader, Bound metaTable, ReadOnlySpan name)
+        where TWriter : IByteBufferWriter
+        where TReader : IByteReader, allows ref struct
+        where TPin : struct, IBufferPin, allows ref struct
+    {
+        Span key = stackalloc byte[1 + PersistedSnapshotTags.MetadataKeyLength];
+        int len = PersistedSnapshotKey.WriteMetadataKey(key, name);
+        if (SortedTableReader.TrySeek(in reader, metaTable, key[..len], out Bound vb))
+        {
+            using TPin pin = reader.PinBuffer(vb);
+            table.Add(key[..len], pin.Buffer);
+        }
+    }
+
+}
diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotPerAddress.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotPerAddress.cs
new file mode 100644
index 000000000000..c7e51cfebbad
--- /dev/null
+++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotPerAddress.cs
@@ -0,0 +1,124 @@
+// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited
+// SPDX-License-Identifier: LGPL-3.0-only
+
+using Nethermind.Core;
+using Nethermind.Serialization.Rlp;
+
+namespace Nethermind.State.Flat.PersistedSnapshots;
+
+/// <summary>
+/// Codec for a persisted snapshot's per-address value: a two-item RLP list
+/// [account, selfdestruct] stored under the account-column key
+/// (<c>PersistedSnapshotKey.AccountColumn</c>). Folds what used to be two separate sub-tag
+/// entries (account and self-destruct) into one self-describing value.
+/// </summary>
+///
+/// Item 0 (account) is three-way, discriminated positionally: a nested slim-account list
+/// (<c>AccountState.Present</c>, first byte ≥ 0xc0), the single byte 0x00
+/// (<c>AccountState.Deleted</c>), or the empty string 0x80
+/// (<c>AccountState.Absent</c> — the address carries a self-destruct record but no account
+/// change). Item 1 (self-destruct) is the integer value of <c>SelfDestructState</c> (0/1/2);
+/// RLP encodes the int 0 as 0x80, which does not clash with item 0's 0x80
+/// because decoding is strictly positional inside the outer list.
+///
+internal static class PersistedSnapshotPerAddress
+{
+    /// <summary>Whether the address has an account change in this snapshot, and of what kind.</summary>
+    internal enum AccountState : byte { Absent, Deleted, Present }
+
+    /// <summary>Self-destruct disposition; the value is the on-disk item-1 integer.</summary>
+    internal enum SelfDestructState : byte { None = 0, Destructed = 1, New = 2 }
+
+    // Item-0 byte written for a Deleted account (see Encode) and matched in DecodeAccountItem.
+    private const byte DeletedAccountByte = 0x00;
+
+    /// <summary>Length reported by <c>Rlp.LengthOfSequence</c> for a list whose content is the
+    /// account item followed by the self-destruct integer — the same content length
+    /// <see cref="Encode"/> passes to <c>StartSequence</c>.</summary>
+    internal static int GetLength(AccountState accountState, Account? account, SelfDestructState sd) =>
+        Rlp.LengthOfSequence(AccountItemLength(accountState, account) + Rlp.LengthOf((int)sd));
+
+    // Encoded size of item 0: the slim-account length when Present, otherwise 1. Deleted is written
+    // as the single byte 0x00 and Absent as the empty byte array, so both take one byte.
+    private static int AccountItemLength(AccountState accountState, Account? account) =>
+        accountState == AccountState.Present ? AccountDecoder.Slim.GetLength(account) : 1;
+
+    /// <summary>Encode the per-address value into <paramref name="stream"/> (reset by the caller);
+    /// returns the number of bytes written. The return value is <c>stream.Position</c>, so it equals
+    /// the bytes written only when the stream was at position 0 on entry.</summary>
+    internal static int Encode(RlpStream stream, AccountState accountState, Account? account, SelfDestructState sd)
+    {
+        stream.StartSequence(AccountItemLength(accountState, account) + Rlp.LengthOf((int)sd));
+        switch (accountState)
+        {
+            case AccountState.Present:
+                // The null-forgiving operator assumes callers pass a non-null account with Present.
+                AccountDecoder.Slim.Encode(account!, stream);
+                break;
+            case AccountState.Deleted:
+                stream.WriteByte(DeletedAccountByte);
+                break;
+            default: // Absent
+                stream.EncodeEmptyByteArray();
+                break;
+        }
+        stream.Encode((int)sd);
+        return stream.Position;
+    }
+
+    /// <summary>Decode the account item only; the self-destruct item is not read here (see
+    /// <see cref="DecodeSelfDestructState"/>). Returns the account kind and, for
+    /// <c>AccountState.Present</c>, the decoded account.</summary>
+    internal static AccountState DecodeAccount(ReadOnlySpan value, out Account? account)
+    {
+        Rlp.ValueDecoderContext ctx = new(value);
+        ctx.ReadSequenceLength();
+        return DecodeAccountItem(ref ctx, out account);
+    }
+
+    /// <summary>Account-flavored read mirroring the legacy TryGetAccount contract:
+    /// false when the address has no account change (<c>AccountState.Absent</c>);
+    /// true with null when deleted, otherwise the account.</summary>
+    internal static bool TryDecodeAccount(ReadOnlySpan value, out Account? account) =>
+        DecodeAccount(value, out account) != AccountState.Absent;
+
+    /// <summary>Decode only the self-destruct item: skips the account item, then casts the decoded
+    /// integer to <see cref="SelfDestructState"/>.</summary>
+    // NOTE(review): the cast is unchecked, so an on-disk integer outside 0..2 comes back as an
+    // undefined enum value rather than an error. DecodeAccountItem, by contrast, throws
+    // RlpException on an unexpected item.
+    internal static SelfDestructState DecodeSelfDestructState(ReadOnlySpan value)
+    {
+        Rlp.ValueDecoderContext ctx = new(value);
+        ctx.ReadSequenceLength();
+        ctx.SkipItem();
+        return (SelfDestructState)ctx.DecodeInt();
+    }
+
+    /// <summary>Map a self-destruct state to the legacy bool? flag: null = none,
+    /// false = destructed, true = new. Every value other than None and Destructed falls into the
+    /// discard arm, so an undefined enum value also maps to true.</summary>
+    internal static bool? ToFlag(SelfDestructState sd) => sd switch
+    {
+        SelfDestructState.None => null,
+        SelfDestructState.Destructed => false,
+        _ => true,
+    };
+
+    /// <summary>Self-destruct flag mirroring the legacy bool? contract: null = none,
+    /// false = destructed, true = new.</summary>
+    internal static bool? DecodeSelfDestruct(ReadOnlySpan value) => ToFlag(DecodeSelfDestructState(value));
+
+    /// <summary>Decode both items at once (account kind + account + self-destruct). The self-destruct
+    /// integer is cast unchecked, as in <see cref="DecodeSelfDestructState"/>.</summary>
+    internal static void Decode(ReadOnlySpan value, out AccountState accountState, out Account? account, out SelfDestructState sd)
+    {
+        Rlp.ValueDecoderContext ctx = new(value);
+        ctx.ReadSequenceLength();
+        accountState = DecodeAccountItem(ref ctx, out account);
+        sd = (SelfDestructState)ctx.DecodeInt();
+    }
+
+    /// <summary>Decode the account item at the current position of <paramref name="ctx"/>. A nested
+    /// list is decoded as a slim account (Present); otherwise the item is read as a byte string:
+    /// empty is Absent and the single byte 0x00 is Deleted.</summary>
+    /// <exception cref="RlpException">The item is a byte string that is neither empty nor the single
+    /// deleted-account byte.</exception>
+    private static AccountState DecodeAccountItem(ref Rlp.ValueDecoderContext ctx, out Account? account)
+    {
+        if (ctx.IsSequenceNext())
+        {
+            account = AccountDecoder.Slim.Decode(ref ctx);
+            return AccountState.Present;
+        }
+
+        account = null;
+        ReadOnlySpan item = ctx.DecodeByteArraySpan();
+        return item.Length switch
+        {
+            0 => AccountState.Absent,
+            1 when item[0] == DeletedAccountByte => AccountState.Deleted,
+            _ => throw new RlpException("Invalid persisted-snapshot per-address account item."),
+        };
+    }
+}
diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotReader.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotReader.cs
new file mode 100644
index 000000000000..5f8f035d3a08
--- /dev/null
+++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotReader.cs
@@ -0,0 +1,98 @@
+// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited
+// SPDX-License-Identifier: LGPL-3.0-only
+
+using Nethermind.Core;
+using Nethermind.Core.Crypto;
+using Nethermind.Int256;
+using Nethermind.State.Flat.Io;
+using Nethermind.State.Flat.PersistedSnapshots.Sorted;
+using Nethermind.Trie;
+
+namespace Nethermind.State.Flat.PersistedSnapshots;
+
+/// <summary>
+/// Read-by-key helpers for a persisted snapshot's single-level <c>SortedTable</c>. Each
+/// helper materializes the verbose <c>PersistedSnapshotKey</c> for the entity and binary
+/// searches the table; the returned <c>Bound</c> covers the entity's value, which the caller
+/// materializes. The lookup is clamped to the entity's column via the
+/// precalculated <c>PersistedSnapshotColumnBounds</c>. Streaming column scans live in
+/// <c>PersistedSnapshotScanner</c>.
+/// </summary>
+public static class PersistedSnapshotReader
+{
+    /// <summary>Seek a materialized <paramref name="key"/>, clamping the index search to the key's
+    /// column via <c>SortedTableReader.TrySeekInColumn</c> when geometry is available
+    /// (<c>bounds.IsValid</c>), else a plain whole-table seek.</summary>
+    private static bool Seek(scoped in TReader reader, Bound table,
+        in PersistedSnapshotColumnBounds bounds, scoped ReadOnlySpan key, out Bound value)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        if (bounds.IsValid)
+        {
+            // key[0] is the column byte; the range it maps to limits the in-column seek.
+            bounds.GetColumnRange(key[0], out long loBlock, out long hiBlock);
+            SortedTable.Footer footer = bounds.Footer;
+            return SortedTableReader.TrySeekInColumn(in reader, table, in footer, loBlock, hiBlock, key, out value);
+        }
+        return SortedTableReader.TrySeek(in reader, table, key, out value);
+    }
+
+    /// <summary>Seek the account-column key for <paramref name="address"/>. On success
+    /// <paramref name="accountBound"/> covers the stored value; this method does not decode it.</summary>
+    internal static bool TryGetAccount(scoped in TReader reader, Bound table, in PersistedSnapshotColumnBounds bounds, Address address, out Bound accountBound)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        Span key = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
+        int len = PersistedSnapshotKey.WriteAccountKey(key, address.Bytes);
+        return Seek(in reader, table, in bounds, key[..len], out accountBound);
+    }
+
+    /// <summary>Seek the slot key for <paramref name="address"/> and <paramref name="index"/>. On
+    /// success <paramref name="slotBound"/> covers the stored value; this method does not decode it.</summary>
+    internal static bool TryGetSlot(scoped in TReader reader, Bound table, in PersistedSnapshotColumnBounds bounds, Address address, in UInt256 index, out Bound slotBound)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        // The slot index goes into the key as 32 big-endian bytes.
+        Span slot = stackalloc byte[32];
+        index.ToBigEndian(slot);
+        Span key = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
+        int len = PersistedSnapshotKey.WriteSlotKey(key, address.Bytes, slot);
+        return Seek(in reader, table, in bounds, key[..len], out slotBound);
+    }
+
+    /// <summary>null when the address has no self-destruct record in this snapshot,
+    /// false when destructed, true when newly created. Decoded from the self-destruct
+    /// item of the per-address value (see <c>PersistedSnapshotPerAddress</c>). null is also returned
+    /// when the account-column key is not found, when the stored value is empty, and when
+    /// <c>reader.TryRead</c> fails, so a failed read is indistinguishable from "no record".</summary>
+    internal static bool? TryGetSelfDestructFlag(scoped in TReader reader, Bound table, in PersistedSnapshotColumnBounds bounds, Address address)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        Span key = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
+        int len = PersistedSnapshotKey.WriteAccountKey(key, address.Bytes);
+        if (!Seek(in reader, table, in bounds, key[..len], out Bound b) || b.Length == 0)
+            return null;
+        int bLen = checked((int)b.Length);
+        // Values of up to 256 bytes are copied into a stack buffer; larger ones get a heap array.
+        Span buf = bLen <= 256 ? stackalloc byte[256] : new byte[bLen];
+        Span value = buf[..bLen];
+        if (!reader.TryRead(b.Offset, value)) return null;
+        return PersistedSnapshotPerAddress.DecodeSelfDestruct(value);
+    }
+
+    /// <summary>
+    /// Look up a state-trie node by tree path. Returns the value holding a
+    /// node reference; the caller decodes it and dereferences into the blob arena.
+    /// </summary>
+    internal static bool TryLoadStateNodeRlp(scoped in TReader reader, Bound table, in PersistedSnapshotColumnBounds bounds, scoped in TreePath path, out Bound bound)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        Span key = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
+        int len = PersistedSnapshotKey.WriteStateNodeKey(key, in path);
+        return Seek(in reader, table, in bounds, key[..len], out bound);
+    }
+
+    /// <summary>Look up a storage-trie node by the owning account's address hash and tree path. On
+    /// success <paramref name="bound"/> covers the stored value; this method does not decode it.</summary>
+    internal static bool TryLoadStorageNodeRlp(scoped in TReader reader, Bound table, in PersistedSnapshotColumnBounds bounds, in ValueHash256 addressHash, in TreePath path, out Bound bound)
+        where TPin : struct, IBufferPin, allows ref struct
+        where TReader : IByteReader, allows ref struct
+    {
+        Span key = stackalloc byte[PersistedSnapshotKey.MaxKeyLength];
+        int len = PersistedSnapshotKey.WriteStorageNodeKey(key, addressHash.Bytes, in path);
+        return Seek(in reader, table, in bounds, key[..len], out bound);
+    }
+}
diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotScanner.cs
b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotScanner.cs new file mode 100644 index 000000000000..511ad1b37877 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotScanner.cs @@ -0,0 +1,342 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core; +using Nethermind.Core.Crypto; +using Nethermind.Int256; +using Nethermind.Serialization.Rlp; +using Nethermind.State.Flat.Io; +using Nethermind.State.Flat.PersistedSnapshots.Sorted; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.Trie; +using AccountState = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotPerAddress.AccountState; +using SelfDestructState = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotPerAddress.SelfDestructState; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// Non-generic entry points for . +/// +public static class PersistedSnapshotScanner +{ + /// + /// A scanner reading through a 's whole-buffer mmap view. The + /// caller owns the session lifetime — it must outlive the returned scanner and any enumerator + /// derived from it. + /// + public static PersistedSnapshotScanner ForWholeRead( + WholeReadSession session, PersistedSnapshot snapshot) => + new(session, snapshot); +} + +/// +/// Streaming scan over a persisted snapshot's single-level , surfacing the +/// same per-address / slot / state-node / storage-node views the prior columnar scanner did. Each view does a full +/// forward pass over the table, skipping the columns it does not own (the columns are contiguous in +/// sorted order). Generic over the byte-reader source so the traversal isn't bound to a specific +/// reader; the caller guarantees the underlying region stays valid for the scanner's lifetime. 
+/// +public sealed class PersistedSnapshotScanner(TSource source, PersistedSnapshot snapshot) + where TSource : IByteReaderSource + where TReader : IByteReader, allows ref struct + where TPin : struct, IBufferPin, allows ref struct +{ + private readonly TSource _source = source; + private readonly PersistedSnapshot _snapshot = snapshot; + + public PerAddressEnumerable PerAddresses => new(_source.CreateReader()); + public SlotEnumerable Slots => new(_source.CreateReader()); + public StateNodeEnumerable StateNodes => new(_snapshot, _source.CreateReader()); + public StorageNodeEnumerable StorageNodes => new(_snapshot, _source.CreateReader()); + + // ---------------- PerAddress (column 0xFE: Account + SelfDestruct) ---------------- + + public readonly ref struct PerAddressEntry(Address address, bool hasAccount, Account? account, bool? selfDestructFlag) + { + public Address Address { get; } = address; + public bool? SelfDestructFlag { get; } = selfDestructFlag; + public bool HasAccount { get; } = hasAccount; + + /// Decoded account, or null when the per-address value's account item is the + /// deleted marker. Branch on first to tell "no account update in this + /// snapshot" from "explicitly deleted". + public Account? Account { get; } = account; + } + + public readonly ref struct PerAddressEnumerable(TReader reader) + { + private readonly TReader _reader = reader; + public PerAddressEnumerator GetEnumerator() => new(_reader); + } + + public ref struct PerAddressEnumerator : IDisposable + { + private TReader _reader; + private SortedTableEnumerator _inner; + private bool _hasRow; + + private Address? _curAddress; + private bool _hasAccount; + private Account? _account; + private bool? 
_sdFlag; + + public PerAddressEnumerator(TReader reader) + { + _reader = reader; + _inner = new SortedTableEnumerator(in reader, new Bound(0, reader.Length)); + _hasRow = _inner.MoveNext(in _reader); + } + + public bool MoveNext() + { + // Skip to the next account-column row; stop once we pass it (metadata sorts after). The + // slot column (0xFD) sorts just before the account column, so it is skipped here. + while (_hasRow && _inner.CurrentKey[0] != PersistedSnapshotKey.AccountColumn) + { + if (_inner.CurrentKey[0] > PersistedSnapshotKey.AccountColumn) { _hasRow = false; break; } + _hasRow = _inner.MoveNext(in _reader); + } + if (!_hasRow) return false; + + _curAddress = new Address(PersistedSnapshotKey.PerAddressAddress(_inner.CurrentKey)); + using (TPin pin = _reader.PinBuffer(_inner.CurrentValue)) + { + PersistedSnapshotPerAddress.Decode(pin.Buffer, out AccountState state, out Account? account, out SelfDestructState sd); + _hasAccount = state != AccountState.Absent; + _account = account; + _sdFlag = PersistedSnapshotPerAddress.ToFlag(sd); + } + _hasRow = _inner.MoveNext(in _reader); + return true; + } + + public readonly PerAddressEntry Current => new(_curAddress!, _hasAccount, _account, _sdFlag); + + public void Dispose() { } + } + + // ---------------- Slot (column 0xFD) ---------------- + + public readonly ref struct SlotEntry(TReader reader, ReadOnlySpan addressBytes, ReadOnlySpan slot32, Bound value) + { + private readonly TReader _reader = reader; + private readonly ReadOnlySpan _address = addressBytes; + private readonly ReadOnlySpan _slot = slot32; + private readonly Bound _value = value; + + /// Raw 20-byte address of this slot — zero-allocation; prefer it over + /// in hot scans (e.g. bloom seeding). + public ReadOnlySpan AddressSpan => _address; + + public Address Address => new(_address); + + public UInt256 Slot => new(_slot, isBigEndian: true); + + public SlotValue? 
Value + { + get + { + if (_value.Length == 0) return null; + using TPin pin = _reader.PinBuffer(_value); + ReadOnlySpan value = new Rlp.ValueDecoderContext(pin.Buffer).DecodeByteArraySpan(); + return SlotValue.FromSpanWithoutLeadingZero(value); + } + } + } + + public readonly ref struct SlotEnumerable(TReader reader) + { + private readonly TReader _reader = reader; + public SlotEnumerator GetEnumerator() => new(_reader); + } + + public ref struct SlotEnumerator : IDisposable + { + private TReader _reader; + private SortedTableEnumerator _inner; + private bool _hasRow; + private bool _returnedRow; + + public SlotEnumerator(TReader reader) + { + _reader = reader; + _inner = new SortedTableEnumerator(in reader, new Bound(0, reader.Length)); + _hasRow = _inner.MoveNext(in _reader); + } + + public bool MoveNext() + { + if (_returnedRow) + { + _hasRow = _inner.MoveNext(in _reader); + _returnedRow = false; + } + while (_hasRow) + { + byte col = _inner.CurrentKey[0]; + if (col == PersistedSnapshotKey.SlotColumn) { _returnedRow = true; return true; } + // Slots (FD) sit between the state columns and the per-address column (FE); once past + // them there is nothing more to yield. 
+ if (col > PersistedSnapshotKey.SlotColumn) { _hasRow = false; break; } + _hasRow = _inner.MoveNext(in _reader); + } + return false; + } + + public readonly SlotEntry Current => new( + _reader, + PersistedSnapshotKey.SlotColumnAddress(_inner.CurrentKey), + PersistedSnapshotKey.SlotColumnSlot(_inner.CurrentKey), + _inner.CurrentValue); + + public void Dispose() { } + } + + // ---------------- StateNode (columns 0xFA/0xFB/0xFC) ---------------- + + public readonly ref struct StateNodeEntry(PersistedSnapshot snapshot, ReadOnlySpan key, Bound value) + { + private readonly PersistedSnapshot _snapshot = snapshot; + private readonly ReadOnlySpan _key = key; + private readonly Bound _value = value; + + public TreePath Path => PersistedSnapshotKey.DecodePath( + PersistedSnapshotKey.StatePathBytes(_key), StateStage(_key[0])); + + public ReadOnlySpan Rlp => _snapshot.ResolveTrieRlp(_value); + } + + public readonly ref struct StateNodeEnumerable(PersistedSnapshot snapshot, TReader reader) + { + private readonly PersistedSnapshot _snapshot = snapshot; + private readonly TReader _reader = reader; + public StateNodeEnumerator GetEnumerator() => new(_snapshot, _reader); + } + + public ref struct StateNodeEnumerator : IDisposable + { + private readonly PersistedSnapshot _snapshot; + private TReader _reader; + private SortedTableEnumerator _inner; + private bool _hasRow; + private bool _returnedRow; + + public StateNodeEnumerator(PersistedSnapshot snapshot, TReader reader) + { + _snapshot = snapshot; + _reader = reader; + _inner = new SortedTableEnumerator(in reader, new Bound(0, reader.Length)); + _hasRow = _inner.MoveNext(in _reader); + } + + public bool MoveNext() + { + if (_returnedRow) + { + _hasRow = _inner.MoveNext(in _reader); + _returnedRow = false; + } + while (_hasRow) + { + byte col = _inner.CurrentKey[0]; + if (col is PersistedSnapshotKey.StateTopColumn or PersistedSnapshotKey.StateCompactColumn or PersistedSnapshotKey.StateFallbackColumn) + { + _returnedRow = true; + 
return true; + } + // State columns (FA/FB/FC) sit between storage (F9) and slots (FD); once + // past them there is nothing more to yield. + if (col > PersistedSnapshotKey.StateTopColumn) { _hasRow = false; break; } + _hasRow = _inner.MoveNext(in _reader); + } + return false; + } + + public readonly StateNodeEntry Current => new(_snapshot, _inner.CurrentKey, _inner.CurrentValue); + + public void Dispose() { } + } + + // ---------------- StorageNode (column 0xF9) ---------------- + + public readonly ref struct StorageNodeEntry(PersistedSnapshot snapshot, ValueHash256 addressHash, ReadOnlySpan key, Bound value) + { + private readonly PersistedSnapshot _snapshot = snapshot; + private readonly ReadOnlySpan _key = key; + private readonly Bound _value = value; + + public ValueHash256 AddressHash { get; } = addressHash; + + public TreePath Path => PersistedSnapshotKey.DecodePath( + PersistedSnapshotKey.StoragePathBytes(_key), StorageStage(PersistedSnapshotKey.StorageSubColumn(_key))); + + public ReadOnlySpan Rlp => _snapshot.ResolveTrieRlp(_value); + } + + public readonly ref struct StorageNodeEnumerable(PersistedSnapshot snapshot, TReader reader) + { + private readonly PersistedSnapshot _snapshot = snapshot; + private readonly TReader _reader = reader; + public StorageNodeEnumerator GetEnumerator() => new(_snapshot, _reader); + } + + public ref struct StorageNodeEnumerator : IDisposable + { + private readonly PersistedSnapshot _snapshot; + private TReader _reader; + private SortedTableEnumerator _inner; + private bool _hasRow; + private bool _returnedRow; + + public StorageNodeEnumerator(PersistedSnapshot snapshot, TReader reader) + { + _snapshot = snapshot; + _reader = reader; + _inner = new SortedTableEnumerator(in reader, new Bound(0, reader.Length)); + _hasRow = _inner.MoveNext(in _reader); + } + + public bool MoveNext() + { + if (_returnedRow) + { + _hasRow = _inner.MoveNext(in _reader); + _returnedRow = false; + } + while (_hasRow) + { + byte col = 
_inner.CurrentKey[0]; + if (col == PersistedSnapshotKey.StorageColumn) { _returnedRow = true; return true; } + // Storage (F9) is the first column; once past it there is nothing more to yield. + if (col > PersistedSnapshotKey.StorageColumn) { _hasRow = false; break; } + _hasRow = _inner.MoveNext(in _reader); + } + return false; + } + + public readonly StorageNodeEntry Current + { + get + { + ValueHash256 hash = default; + PersistedSnapshotKey.StorageAddressHash(_inner.CurrentKey).CopyTo(hash.BytesAsSpan); + return new StorageNodeEntry(_snapshot, hash, _inner.CurrentKey, _inner.CurrentValue); + } + } + + public void Dispose() { } + } + + private static int StateStage(byte column) => column switch + { + PersistedSnapshotKey.StateTopColumn => 0, + PersistedSnapshotKey.StateCompactColumn => 1, + _ => 2, + }; + + private static int StorageStage(byte subColumn) => subColumn switch + { + PersistedSnapshotKey.StorageCompactSub => 1, + _ => 2, + }; +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotStack.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotStack.cs new file mode 100644 index 000000000000..09c4e8a14e4b --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotStack.cs @@ -0,0 +1,186 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Diagnostics; +using Nethermind.Core; +using Nethermind.Core.Attributes; +using Nethermind.Core.Crypto; +using Nethermind.Int256; +using Nethermind.State.Flat.Persistence.BloomFilter; +using Nethermind.Trie; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// The persisted-snapshot half of a : a stack of +/// s probed newest-first, each gated by its own +/// before any disk read is paid. +/// +/// +/// Owns the snapshot list — releases it (each snapshot disposes its own +/// bloom). 
Also owns the detailed metrics recorded around the probe loops: each +/// *_persisted_snapshot hit label and the per-key-kind skip-time observations. +/// +public sealed class PersistedSnapshotStack( + PersistedSnapshotList snapshots, + bool recordDetailedMetrics) : IDisposable +{ + private static readonly StringLabel _readAccountPersistedLabel = new("account_persisted_snapshot"); + private static readonly StringLabel _readStoragePersistedLabel = new("storage_persisted_snapshot"); + private static readonly StringLabel _readStateRlpPersistedLabel = new("state_rlp_persisted_snapshot"); + private static readonly StringLabel _readStorageRlpPersistedLabel = new("storage_rlp_persisted_snapshot"); + + private static readonly StringLabel _skipAccountLabel = new("account"); + private static readonly StringLabel _skipSlotLabel = new("slot"); + private static readonly StringLabel _skipStateRlpLabel = new("state_rlp"); + private static readonly StringLabel _skipStorageRlpLabel = new("storage_rlp"); + + private readonly PersistedSnapshotList _snapshots = snapshots; + private readonly bool _recordDetailedMetrics = recordDetailedMetrics; + + public static PersistedSnapshotStack Empty(bool recordDetailedMetrics = false) => + new(PersistedSnapshotList.Empty(), recordDetailedMetrics); + + public int Count => _snapshots.Count; + + /// true when a snapshot holds an entry for the address — + /// is then the stored account, or null for a + /// deletion marker. false means the caller should fall through to persistence. + public bool TryGetAccount(Address address, out Account? account) + { + // PersistedSnapshot's per-address column is keyed by raw Address; the bloom seed + // also derives from raw Address bytes, so no Keccak round-trip is needed here. + long psw = _recordDetailedMetrics ? 
Stopwatch.GetTimestamp() : 0; + if (_snapshots.Count > 0) + { + ulong addrBloomKey = PersistedSnapshotBloomBuilder.AddressKey(address); + for (int i = _snapshots.Count - 1; i >= 0; i--) + { + if (!_snapshots[i].Bloom.MightContain(addrBloomKey)) continue; + if (_snapshots[i].TryGetAccount(address, out account)) + { + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleTimes.Observe(Stopwatch.GetTimestamp() - psw, _readAccountPersistedLabel); + return true; + } + } + } + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleSkipTime.Observe(Stopwatch.GetTimestamp() - psw, _skipAccountLabel); + + account = null; + return false; + } + + /// + /// Find the index (within this stack) of the newest snapshot carrying a self-destruct + /// flag for . + /// + public bool TryGetSelfDestruct(Address address, out int snapshotIdx) + { + if (_snapshots.Count > 0) + { + ulong addrBloomKey = PersistedSnapshotBloomBuilder.AddressKey(address); + for (int i = _snapshots.Count - 1; i >= 0; i--) + { + if (!_snapshots[i].Bloom.MightContain(addrBloomKey)) continue; + bool? flag = _snapshots[i].TryGetSelfDestructFlag(address); + if (flag.HasValue) + { + snapshotIdx = i; + return true; + } + } + } + + snapshotIdx = -1; + return false; + } + + /// + /// Probe the stack newest-first for the storage slot, stopping at the self-destruct + /// boundary. + /// + /// Index (within this stack) of the snapshot holding + /// the newest self-destruct for the address; that snapshot itself is still probed, but the older snapshots below it are not. + /// Timestamp of the bundle-level lookup start; the hit + /// observation is based here so the recorded time spans the in-memory scan too, + /// matching the label's historical semantics. + /// true when the stack resolved the slot definitively — either a stored + /// value, or null because the self-destruct boundary was reached. false + /// means the caller should fall through to persistence. 
+ public bool TryGetSlot(Address address, in UInt256 index, int selfDestructStateIdx, long lookupStart, out byte[]? value) + { + long psw = _recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; + if (_snapshots.Count > 0) + { + ulong addrBloomKey = PersistedSnapshotBloomBuilder.AddressKey(address); + ulong slotBloomKey = PersistedSnapshotBloomBuilder.SlotKey(addrBloomKey, in index); + for (int i = _snapshots.Count - 1; i >= 0; i--) + { + BloomFilter bloom = _snapshots[i].Bloom; + if (bloom.MightContain(addrBloomKey) && bloom.MightContain(slotBloomKey)) + { + SlotValue slotValue = default; + if (_snapshots[i].TryGetSlot(address, in index, ref slotValue)) + { + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleTimes.Observe(Stopwatch.GetTimestamp() - lookupStart, _readStoragePersistedLabel); + value = slotValue.ToEvmBytes(); + return true; + } + } + + if (i <= selfDestructStateIdx) + { + value = null; + return true; + } + } + } + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleSkipTime.Observe(Stopwatch.GetTimestamp() - psw, _skipSlotLabel); + + value = null; + return false; + } + + public bool TryLoadStateRlp(in TreePath path, out byte[]? rlp) + { + long sw = _recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; + ulong statePathBloomKey = PersistedSnapshotBloomBuilder.StatePathKey(in path); + for (int i = _snapshots.Count - 1; i >= 0; i--) + { + if (!_snapshots[i].Bloom.MightContain(statePathBloomKey)) continue; + if (_snapshots[i].TryLoadStateNodeRlp(in path, out rlp)) + { + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleTimes.Observe(Stopwatch.GetTimestamp() - sw, _readStateRlpPersistedLabel); + return true; + } + } + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleSkipTime.Observe(Stopwatch.GetTimestamp() - sw, _skipStateRlpLabel); + + rlp = null; + return false; + } + + public bool TryLoadStorageRlp(Hash256 address, in TreePath path, out byte[]? rlp) + { + long sw = _recordDetailedMetrics ? 
Stopwatch.GetTimestamp() : 0; + // Caller already provides the address-hash; convert to the struct ValueHash256 + // (no alloc) so the read path stays Hash256-free below. + ValueHash256 addressHash = address.ValueHash256; + ulong storageBloomKey = PersistedSnapshotBloomBuilder.StorageNodeKey(in addressHash, in path); + for (int i = _snapshots.Count - 1; i >= 0; i--) + { + if (!_snapshots[i].Bloom.MightContain(storageBloomKey)) continue; + if (_snapshots[i].TryLoadStorageNodeRlp(in addressHash, in path, out rlp)) + { + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleTimes.Observe(Stopwatch.GetTimestamp() - sw, _readStorageRlpPersistedLabel); + return true; + } + } + if (_recordDetailedMetrics) Metrics.ReadOnlySnapshotBundleSkipTime.Observe(Stopwatch.GetTimestamp() - sw, _skipStorageRlpLabel); + + rlp = null; + return false; + } + + public void Dispose() => _snapshots.Dispose(); +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotTags.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotTags.cs new file mode 100644 index 000000000000..5d02949c456a --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotTags.cs @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots; + +/// +/// Shared on-disk vocabulary for the persisted snapshot's two-level sorted table: value-marker +/// bytes, metadata key names, and layout-width constants. The verbose key encoding (column / +/// subcolumn tags stored as 255 − tag) lives in ; this type +/// holds only the format constants that producers (, +/// ) and consumers (, +/// , ) must agree on. +/// +internal static class PersistedSnapshotTags +{ + // Per-addressHash column outer key width — first 20 bytes of Keccak(address). 
+ internal const int AddressHashPrefixLength = 20; + + // The per-address value (account + self-destruct, account column 0xFE) is a two-item RLP list + // encoded/decoded by PersistedSnapshotPerAddress. + + // Metadata key names. NUL-padded to a fixed 10 bytes (the longest original key, "from_block"); + // padding preserves sort order because no original key is a prefix of another. + internal const int MetadataKeyLength = 10; + // Base snapshots only: the contiguous trie-RLP run in the single blob arena they wrote into, + // serialized as a BlobRange; absent on compacted / CompactSized snapshots (BlobRange.None). + internal static readonly byte[] MetadataBlobRangeKey = "blob_range"u8.ToArray(); + internal static readonly byte[] MetadataFromBlockKey = "from_block"u8.ToArray(); + internal static readonly byte[] MetadataFromHashKey = "from_hash\0"u8.ToArray(); + internal static readonly byte[] MetadataNodeRefsKey = "noderefs\0\0"u8.ToArray(); + internal static readonly byte[] MetadataToBlockKey = "to_block\0\0"u8.ToArray(); + internal static readonly byte[] MetadataToHashKey = "to_hash\0\0\0"u8.ToArray(); + internal static readonly byte[] MetadataVersionKey = "version\0\0\0"u8.ToArray(); + + // Referenced blob-arena ids are stored as one record per id (key = ref-id column + id; see + // PersistedSnapshotKey.WriteRefIdKey) rather than a single list value, so they merge/dedup + // through the normal N-way merge and iterate like any other records. This is the per-id value. + internal static readonly byte[] RefIdValue = [0x01]; + + // On-disk format version, written as the value of MetadataVersionKey by the builder and copied + // through by the merger. Bump when the on-disk layout changes. + // v5: single-level sorted table (replaces the columnar format). + // v6: streaming two-level sorted table — i64 footer, index block located by stored byte offset. + // v7: trie-node key encoding aligned to persistence — state top 3-byte, storage drops 4-byte top. 
+ // v8: slots moved out of the per-address account column into their own top-level column (sorts + // just before the account column); the account column now holds only account + self-destruct. + // v9: per-address account and self-destruct folded into one [account, selfdestruct] RLP-list + // value (see PersistedSnapshotPerAddress); the per-address sub-tag is dropped. + internal static readonly byte[] MetadataFormatVersion = [0x09]; + + // Largest RLP encoding of a slot value: a 32-byte string is a 1-byte prefix (0xa0) plus 32 + // bytes. Mirrors BaseFlatPersistence.RlpSlotValueBufferSize. + internal const int RlpSlotValueBufferSize = SlotValue.ByteCount + 1; + + // Presence marker for MetadataNodeRefsKey. The key itself is the signal; the value just + // satisfies the non-empty-value requirement. + internal static readonly byte[] MetadataNodeRefsPresentMarker = [0x01]; +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotUtils.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotUtils.cs new file mode 100644 index 000000000000..7c2d8ad6432a --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/PersistedSnapshotUtils.cs @@ -0,0 +1,148 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Text.Json; +using Nethermind.Core; +using Nethermind.Core.Collections; +using Nethermind.Core.Crypto; +using Nethermind.Core.Extensions; +using Nethermind.Int256; +using Nethermind.Serialization.Rlp; +using Nethermind.Trie; + +namespace Nethermind.State.Flat.PersistedSnapshots; + +internal static class PersistedSnapshotUtils +{ + internal static void DumpSnapshotToJson(Snapshot snapshot, string filename) + { + Dictionary dump = []; + + Dictionary accounts = []; + foreach (KeyValuePair, Account?> kv in snapshot.Accounts) + { + Address address = kv.Key; + accounts[address.Bytes.ToHexString(false)] = kv.Value is null + ? 
"" + : AccountDecoder.Slim.Encode(kv.Value).Bytes.ToHexString(false); + } + dump["accounts"] = accounts; + + Dictionary storages = []; + foreach (KeyValuePair, SlotValue?> kv in snapshot.Storages) + { + (Address addr, UInt256 slot) = kv.Key.Key; + // Slot serialized as decimal so it survives JSON round-trips without ambiguity. + string key = $"{addr.Bytes.ToHexString(false)}:{slot}"; + storages[key] = kv.Value.HasValue + ? kv.Value.Value.AsReadOnlySpan.ToHexString(false) + : ""; + } + dump["storages"] = storages; + + Dictionary selfDestructed = []; + foreach (KeyValuePair, bool> kv in snapshot.SelfDestructedStorageAddresses) + { + Address address = kv.Key; + selfDestructed[address.Bytes.ToHexString(false)] = kv.Value; + } + dump["selfDestructed"] = selfDestructed; + + Dictionary stateNodes = []; + foreach (KeyValuePair, TrieNode> kv in snapshot.StateNodes) + { + if (kv.Value.FullRlp.Length == 0 && kv.Value.NodeType == NodeType.Unknown) continue; + TreePath path = kv.Key; + string key = $"{path.Span.ToHexString(false)}:{path.Length}"; + stateNodes[key] = kv.Value.FullRlp.AsSpan().ToHexString(false); + } + dump["stateNodes"] = stateNodes; + + Dictionary storageNodes = []; + foreach (KeyValuePair, TrieNode> kv in snapshot.StorageNodes) + { + if (kv.Value.FullRlp.Length == 0 && kv.Value.NodeType == NodeType.Unknown) continue; + (Hash256 hash, TreePath path) = kv.Key.Key; + string key = $"{hash.Bytes.ToHexString(false)}:{path.Span.ToHexString(false)}:{path.Length}"; + storageNodes[key] = kv.Value.FullRlp.AsSpan().ToHexString(false); + } + dump["storageNodes"] = storageNodes; + + File.WriteAllText(filename, JsonSerializer.Serialize(dump)); + } + + internal static void ValidatePersistedSnapshot(Snapshot snapshot, PersistedSnapshot persisted, bool dumpWhenFailed = true) + { + string filename = $"broken.{snapshot.From.BlockNumber}.{snapshot.To.BlockNumber}.json"; + + try + { + foreach (KeyValuePair, Account?> kv in snapshot.Accounts) + { + Address address = kv.Key; + if 
(!persisted.TryGetAccount(address, out Account? acc)) + throw new InvalidOperationException($"Account {address} not found in persisted snapshot"); + + if (kv.Value is null) + { + if (acc is not null) + throw new InvalidOperationException($"Account {address} should be null but has RLP data"); + } + else + { + if (acc is null || acc.Balance != kv.Value.Balance || acc.Nonce != kv.Value.Nonce + || acc.CodeHash != kv.Value.CodeHash || acc.StorageRoot != kv.Value.StorageRoot) + { + throw new InvalidOperationException($"Account {address} mismatch"); + } + } + } + + foreach (KeyValuePair, SlotValue?> kv in snapshot.Storages) + { + (Address addr, UInt256 slot) = kv.Key.Key; + SlotValue slotValue = default; + if (!persisted.TryGetSlot(addr, slot, ref slotValue)) + throw new InvalidOperationException($"Storage {addr}:{slot} not found in persisted snapshot"); + + SlotValue expected = kv.Value ?? default; + if (!slotValue.AsReadOnlySpan.SequenceEqual(expected.AsReadOnlySpan)) + throw new InvalidOperationException($"Storage {addr}:{slot} mismatch"); + } + + foreach (KeyValuePair, bool> kv in snapshot.SelfDestructedStorageAddresses) + { + Address address = kv.Key; + bool? flag = persisted.TryGetSelfDestructFlag(address) ?? throw new InvalidOperationException($"SelfDestruct {address} not found in persisted snapshot"); + if (flag.Value != kv.Value) + throw new InvalidOperationException($"SelfDestruct {address} mismatch: expected {kv.Value}, got {flag.Value}"); + } + + foreach (KeyValuePair, TrieNode> kv in snapshot.StateNodes) + { + if (kv.Value.FullRlp.Length == 0 && kv.Value.NodeType == NodeType.Unknown) continue; + TreePath path = kv.Key; + if (!persisted.TryLoadStateNodeRlp(in path, out byte[]? 
nodeRlp)) + throw new InvalidOperationException($"StateNode at path length {path.Length} not found in persisted snapshot"); + if (!nodeRlp!.AsSpan().SequenceEqual(kv.Value.FullRlp.AsSpan())) + throw new InvalidOperationException($"StateNode at path length {path.Length} RLP mismatch"); + } + + foreach (KeyValuePair, TrieNode> kv in snapshot.StorageNodes) + { + if (kv.Value.FullRlp.Length == 0 && kv.Value.NodeType == NodeType.Unknown) continue; + (Hash256 hash, TreePath path) = kv.Key.Key; + ValueHash256 hashStruct = hash.ValueHash256; + if (!persisted.TryLoadStorageNodeRlp(in hashStruct, path, out byte[]? nodeRlp)) + throw new InvalidOperationException($"StorageNode {hash} at path length {path.Length} not found in persisted snapshot"); + if (!nodeRlp!.AsSpan().SequenceEqual(kv.Value.FullRlp.AsSpan())) + throw new InvalidOperationException($"StorageNode {hash} at path length {path.Length} RLP mismatch"); + } + } + catch (InvalidOperationException ex) + { + if (dumpWhenFailed) DumpSnapshotToJson(snapshot, filename); + throw new InvalidOperationException(dumpWhenFailed ? $"{ex.Message}. Dumped snapshot to {filename}" : ex.Message, ex); // mention the dump file only when one was actually written + } + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/Block.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/Block.cs new file mode 100644 index 000000000000..7cbd9e0c3bc6 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/Block.cs @@ -0,0 +1,252 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Buffers.Binary; +using Nethermind.Core.Collections; +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Sorted; + +/// +/// A single, self-describing, binary-searchable block of front-coded key/value records — the shared +/// unit of both the data blocks and the top-level index of a . 
+/// +/// +/// Wire layout (offsets relative to the block start): +/// +/// [offsetWidth u8] ; W = 2 or 4 bytes +/// [recordsEnd : W] ; block-relative byte offset where records end (content size) +/// [numRestarts : W] +/// [restartOffset : W × numRestarts] ; block-relative; restartOffset[0] = 1 + 2W + W·numRestarts +/// [records...] ; [cp u8][suffixLen u8][keySuffix][vs u8][value] +/// +/// Keys are front-coded against the previous record, resetting (cp = 0, full key) every +/// restartInterval records and at the block start — these are the restarts. The +/// per-block offsetWidth lets a small block (≤ 64 KiB, e.g. a 4 KiB data block) use 2-byte +/// offsets while a large block (e.g. the multi-MB index) uses 4-byte offsets, so one format serves +/// both. binary searches the restarts then scans to +/// recordsEnd for the first key ≥ the target (LevelDB Block::Iter::Seek). +/// +internal static class Block +{ + /// Width of the single-byte record fields (common-prefix, key-suffix size, value size). + internal const int SizePrefix = sizeof(byte); + + internal const byte Width2 = 2; + internal const byte Width4 = 4; + + /// Block-relative byte offset of the first record, given the offset width and restart count. + internal static long RecordsStart(int width, long numRestarts) => 1 + 2L * width + (long)width * numRestarts; + + internal static long ReadOffset(scoped ReadOnlySpan src, int width) => + width == Width2 ? BinaryPrimitives.ReadUInt16LittleEndian(src) : BinaryPrimitives.ReadUInt32LittleEndian(src); +} + +/// +/// Builds one : records are added in ascending key order, front-coded and +/// restart-tracked off-heap, then emitted to a writer at , which picks the +/// narrowest offset width that fits the finished block. 
+/// +internal sealed class BlockBuilder(int restartInterval, int expectedBytes = 4096) : IDisposable +{ + private readonly NativeMemoryList _body = new(Math.Max(64, expectedBytes)); + private readonly NativeMemoryList _restarts = new(64); + private readonly byte[] _prevKey = new byte[256]; + private int _prevKeyLen; + private int _recordCount; + + public int RecordCount => _recordCount; + + /// Append a record. Keys must arrive in ascending order; key and value lengths ≤ 255. + public void Add(scoped ReadOnlySpan key, scoped ReadOnlySpan value) + { + int cp; + if (_recordCount % restartInterval == 0) + { + _restarts.Add(_body.Count); + cp = 0; + } + else + { + cp = ((ReadOnlySpan)_prevKey.AsSpan(0, _prevKeyLen)).CommonPrefixLength(key); + } + + Span hdr = stackalloc byte[2]; + hdr[0] = (byte)cp; + hdr[1] = (byte)(key.Length - cp); + _body.AddRange(hdr); + _body.AddRange(key[cp..]); + hdr[0] = (byte)value.Length; + _body.AddRange(hdr[..1]); + _body.AddRange(value); + + key.CopyTo(_prevKey); + _prevKeyLen = key.Length; + _recordCount++; + } + + /// Whether adding a record of the given key/value lengths would push the finished block + /// (assuming the 2-byte width that any ≤ 64 KiB block uses) past . + /// Used by the data-block size cap; the index block is never capped. + public bool WouldExceedIfAdded(int keyLen, int valueLen, int contentLimit) + { + int nRestarts = _restarts.Count + (_recordCount % restartInterval == 0 ? 1 : 0); + long header = Block.RecordsStart(Block.Width2, nRestarts); + int recordMax = 2 + keyLen + Block.SizePrefix + valueLen; + return header + _body.Count + recordMax > contentLimit; + } + + /// Emit the finished block to ; returns the bytes written. + public long Finish(ref TWriter writer) where TWriter : IByteBufferWriter + { + int n = _restarts.Count; + int bodyLen = _body.Count; + // bodyLen and n are width-independent, so a single trial-at-2 / fall-to-4 is exact. 
+ long end2 = Block.RecordsStart(Block.Width2, n) + bodyLen; + int width = end2 <= ushort.MaxValue && n <= ushort.MaxValue ? Block.Width2 : Block.Width4; + long recordsStart = Block.RecordsStart(width, n); + long recordsEnd = recordsStart + bodyLen; + + long start = writer.Written; + writer.GetSpan(1)[0] = (byte)width; + writer.Advance(1); + WriteOffset(ref writer, width, recordsEnd); + WriteOffset(ref writer, width, n); + Span rs = _restarts.AsSpan(); + for (int k = 0; k < n; k++) + WriteOffset(ref writer, width, recordsStart + rs[k]); + IByteBufferWriter.Copy(ref writer, _body.AsSpan()); + return writer.Written - start; + } + + public void Reset() + { + _body.Clear(); + _restarts.Clear(); + _prevKeyLen = 0; + _recordCount = 0; + } + + public void Dispose() + { + _body.Dispose(); + _restarts.Dispose(); + } + + private static void WriteOffset(ref TWriter writer, int width, long value) where TWriter : IByteBufferWriter + { + Span dst = writer.GetSpan(width); + if (width == Block.Width2) BinaryPrimitives.WriteUInt16LittleEndian(dst, checked((ushort)value)); + else BinaryPrimitives.WriteUInt32LittleEndian(dst, checked((uint)value)); + writer.Advance(width); + } +} + +/// Read-side search and header parsing for a . +internal static class BlockReader +{ + /// Parse the block header at : offset width, the + /// block-relative records-end, restart count, and the block-relative records start. 
+ internal static bool ReadHeader(scoped in TReader reader, long blockStart, + out int width, out long recordsEnd, out long numRestarts, out long recordsStart) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + width = 0; + recordsEnd = 0; + numRestarts = 0; + recordsStart = 0; + + Span buf = stackalloc byte[4]; + if (!reader.TryRead(blockStart, buf[..1])) return false; + int w = buf[0]; + if (w != Block.Width2 && w != Block.Width4) return false; + if (!reader.TryRead(blockStart + 1, buf[..w])) return false; + recordsEnd = Block.ReadOffset(buf, w); + if (!reader.TryRead(blockStart + 1 + w, buf[..w])) return false; + numRestarts = Block.ReadOffset(buf, w); + width = w; + recordsStart = Block.RecordsStart(w, numRestarts); + return true; + } + + /// + /// Position at the first record whose key ≥ (the ceiling) in the block + /// at : predecessor-restart binary search, then a forward scan to + /// recordsEnd. On a hit copies the ceiling key into and returns + /// its value . Returns false when the block is empty or every key is + /// < . + /// + /// Lower bound (inclusive) restart index for the binary search; defaults + /// to 0. A caller that knows lies within a contiguous restart sub-range + /// (e.g. a single column of a index block) passes it to skip the rest of + /// the search. Clamped into range; the forward scan stays unbounded, so the result is identical to + /// an unclamped search whenever the true predecessor restart is ≥ this value (always the case for an + /// in-range target). + /// Upper bound (inclusive) restart index for the binary search; + /// defaults to the last restart. 
+ internal static bool SeekCeiling(scoped in TReader reader, long blockStart, + scoped ReadOnlySpan target, scoped Span keyBuf, out int keyLen, out Bound value, + long firstRestart = 0, long lastRestartInclusive = long.MaxValue) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + keyLen = 0; + value = default; + if (!ReadHeader(in reader, blockStart, out int width, out long recordsEnd, out long numRestarts, out _)) + return false; + if (numRestarts == 0) return false; + + long restartTableStart = blockStart + 1 + 2L * width; + Span ob = stackalloc byte[4]; + Span hdr = stackalloc byte[2]; + + // Rightmost restart whose first key <= target (cp == 0 there, so the suffix is the full key), + // searched within the caller's clamped restart window. + long loRestart = Math.Clamp(firstRestart, 0, numRestarts - 1); + long hiRestart = Math.Clamp(lastRestartInclusive, loRestart, numRestarts - 1); + long lo = loRestart; + long hi = hiRestart; + long found = -1; + while (lo <= hi) + { + long mid = lo + ((hi - lo) >> 1); + if (!reader.TryRead(restartTableStart + mid * width, ob[..width])) return false; + long recStart = blockStart + Block.ReadOffset(ob, width); + if (!reader.TryRead(recStart, hdr)) return false; + int firstKeyLen = hdr[1]; + using TPin keyPin = reader.PinBuffer(new Bound(recStart + 2, firstKeyLen)); + if (keyPin.Buffer.SequenceCompareTo(target) <= 0) { found = mid; lo = mid + 1; } + else hi = mid - 1; + } + + // target < firstKey(window) ⇒ ceiling is the window's first record; clamp the scan start to it. + long scanRestart = found < 0 ? loRestart : found; + if (!reader.TryRead(restartTableStart + scanRestart * width, ob[..width])) return false; + long pos = blockStart + Block.ReadOffset(ob, width); + long end = blockStart + recordsEnd; + + // Scan forward across restart boundaries (cp = 0 self-corrects) for the first key >= target. 
+ while (pos < end) + { + if (!reader.TryRead(pos, hdr)) return false; + int cp = hdr[0]; + int suffixLen = hdr[1]; + if (!reader.TryRead(pos + 2, keyBuf.Slice(cp, suffixLen))) return false; // keep [0..cp) from prev + int kLen = cp + suffixLen; + + long valueSizeOffset = pos + 2 + suffixLen; + if (!reader.TryRead(valueSizeOffset, hdr[..1])) return false; + int valueLen = hdr[0]; + + if (target.SequenceCompareTo(keyBuf[..kLen]) <= 0) + { + keyLen = kLen; + value = new Bound(valueSizeOffset + Block.SizePrefix, valueLen); + return true; + } + pos = valueSizeOffset + Block.SizePrefix + valueLen; + } + return false; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/FORMAT.md b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/FORMAT.md new file mode 100644 index 000000000000..19b1e6cc6ced --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/FORMAT.md @@ -0,0 +1,93 @@ +# Persisted-snapshot sorted-table format + +A persisted snapshot's metadata blob is a single **two-level sorted table** (`SortedTable`), laid out +like a LevelDB SSTable: a run of 4 KiB-aligned data blocks plus one index block, both using the same +self-describing block format. It replaces the previous columnar format. Trie-node RLP still lives +in separate blob arenas; the table stores only small inline values (account RLP, slot RLP, 6-byte +`NodeRef`s, self-destruct flags, metadata). 
+ +## Layout (within the table's `Bound`, offsets relative to the bound start) + +``` +data block × M ; blocks 0..M-2 zero-padded to BlockSize (4096); data block i at i·BlockSize +index block ; right after the last (unpadded) data block, at the footer's indexOffset; NOT block-aligned; + ; key = separator, value = u32 blockNumber LE +footer ; [count i64][numDataBlocks i64][indexOffset i64][restartInterval u8][version u8] (fixed 26 bytes, read first) + +Block (data and index alike): + [offsetWidth u8] ; W = 2 or 4 bytes + [recordsEnd : W] ; block-relative byte offset where records end (content size) + [numRestarts : W] + [restartOffset : W × numRestarts] ; block-relative; restartOffset[0] = 1 + 2W + W·numRestarts + [records...] ; [cp u8][suffixLen u8][keySuffix][vs u8][value] +``` + +- Both levels reuse one `Block` (`Block.cs`). Within a block, keys are **front-coded**: `cp` is the + number of leading bytes shared with the previous record's key and `keySuffix` is the remaining + `suffixLen` bytes, so the full key = previous key's first `cp` bytes + `keySuffix`. Front-coding + **resets** (`cp = 0`, full key) every `restartInterval` (default **8**) records and at every block + start — these reset points are the **restarts**, and each block prefixes a table of their byte + offsets. The per-block **`offsetWidth`** (`W`) is the narrowest of 2 or 4 bytes that addresses the + finished block: a ≤ 64 KiB data block uses `W = 2`, the multi-MB index uses `W = 4`. `recordsEnd` + lets a block be located by its **start alone** — crucial because data blocks are zero-padded; the + scan/enumeration stops at `recordsEnd` and never reads pad bytes. `cp`, `suffixLen`, and the value + size `vs` are each one byte: keys are ≤ 55 bytes, every inline value is < 255. The one variable-length + datum, the referenced blob-arena id list, is stored as separate records (see below), so no value + overflows. 
+- Records are **streamed and packed** into data blocks in ascending key order; a data block closes once + the next record would push its content past `BlockSize` (4096). Blocks 0..M-2 are **zero-padded to + 4096** so block `i` sits at `i·BlockSize` and is addressed by **block number** — a `u32` block number + times 4096 reaches a 16 TiB data region. The **last** data block is left unpadded, with the index + block immediately after it. +- The **index block** maps, per data block, the shortest **separator** key in + `[lastKey(block), firstKey(next block))` (the last block's separator is its own last key) to that + block's number. It is located directly by the footer's `indexOffset` (a table-relative byte offset), + so it needs no block-number address and no padding; the i64 footer fields span the full range. +- A lookup (`SortedTableReader`) reads the footer, then does two `BlockReader.SeekCeiling` calls + (LevelDB `Block::Iter::Seek`): (1) ceiling over the **index block** — the first separator ≥ the + target yields the data block number (a target past the last separator misses); (2) ceiling over that + **data block** — the first key ≥ the target; a hit requires that key to **equal** the target. Each + ceiling binary-searches the restarts (rightmost restart whose first key ≤ target, clamped to restart + 0 when the target precedes the block) then scans forward to `recordsEnd`, reconstructing front-coded + keys. O(log M) + O(log restarts) random reads + a short in-page scan; no caching, no per-table bloom. +- The **builder** (`SortedTableBuilder`) requires records in **strictly ascending** key order and + streams them straight into a data `BlockBuilder` (closing + padding at 4096) as they arrive — no + record buffer, so the table size is bounded by the 16 TiB data region rather than by memory. The index + `BlockBuilder` (separator → block number) accrues one entry per flushed data block; only the current + data block and the index are held in memory. 
Producers (`PersistedSnapshotBuilder`, + `PersistedSnapshotMerger`) therefore emit in ascending key order (see Keys below). +- `version` rejects a blob written by a different format; the catalog version (`SnapshotCatalog`) + gates the whole tier across incompatible changes. + +## Keys (`PersistedSnapshotKey`) + +The table is plain ascending byte-sorted — no custom comparator. To reproduce the columnar reverse-tag +emission order (DenseByteIndex containers wrote tags descending), the **column and subcolumn tag +bytes are stored as `255 − tag`**; entity bytes are natural. Ascending order then is: + +| Entity | Key bytes (tags as 255−v) | Value | +|---|---|---| +| Ref-id | `00` + blobArenaId(2 BE) | `[01]` presence | +| Storage node | `FA` + addrHash(20) + `{FF top, FE compact, FD fallback}` + path | `NodeRef` (6) | +| State node | `{FD top, FC compact, FB fallback}` + path | `NodeRef` (6) | +| Slot | `FE` + addr(20) + `FD` + slot(32 BE) | RLP-wrapped value / empty (deleted) | +| Self-destruct | `FE` + addr(20) + `FE` | `[00]` destructed / `[01]` new | +| Account | `FE` + addr(20) + `FF` | slim account RLP / `[00]` deleted | +| Metadata | `FF` + name(10, NUL-padded) | metadata value | + +Each referenced blob-arena id is its own record under column `00`, which sorts before every real +column — so the ref-ids are the first records and iterate cheaply from the table start +(`PersistedSnapshot`'s ref-id enumerator stops at the first non-`00` record). Within an address: +slots → self-destruct → account. Within an addressHash: fallback → compact → top. Across columns: +ref-ids → storage → state → per-address → metadata. The path encodings (4/8/33-byte) and the +per-bucket ordering are unchanged from the columnar builder/compacter so a future proper columnar serializer +can reuse them. + +## Compaction (`PersistedSnapshotMerger`) + +Each input snapshot is one sorted run. The merge walks them in ascending key order (O(N) find-min), +newest-source-wins per key. 
Ref-id records dedup through this same merge, yielding the union of +referenced ids for free. Slots are buffered per address and flushed once that address's +self-destruct barrier is known — slots that contributed only from sources older than the newest +destruct are dropped (self-destruct truncation). The remaining metadata (`from_*` from the oldest +source, `to_*`/`version` from the newest, a `noderefs` presence marker) is written separately. diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTable.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTable.cs new file mode 100644 index 000000000000..84a4fc928a93 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTable.cs @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Buffers.Binary; +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Sorted; + +/// +/// Shared wire-format constants and footer helper for the two-level sorted table that backs a +/// persisted snapshot's metadata blob. It is an ascending byte-sorted map of fully-materialized keys +/// to small inline values, laid out as a run of 4 KiB-aligned data blocks +/// addressed by block number, followed by a single index block (separator → block number) and a footer. +/// +/// +/// Layout within a table's (offsets relative to the bound start): +/// +/// data block × M ; blocks 0..M-2 zero-padded to BlockSize (4096); block i at i·BlockSize. +/// The last data block (M-1) is NOT padded — the index follows it immediately. 
+/// index block ; one Block at byte offset indexOffset; NOT block-aligned (it is located by +/// the footer, not addressed by block number); key = separator, value = u32 block number LE +/// footer ; [count i64][numDataBlocks i64][indexOffset i64][restartInterval u8][version u8] (fixed FooterSize) +/// +/// Each data block holds a slice of the sorted records; the index block maps the shortest separator in +/// [lastKey(block i), firstKey(block i+1)) (the last block's separator is its own last key) to +/// the block number, so a lookup is two calls (index → block +/// number → data block). Data blocks are addressed by number (× BlockSize), so a u32 block number +/// reaches a 16 TiB data region; the single index block is addressed directly by the footer's +/// indexOffset, so it needs no padding and the footer fields are i64 to span the full range. +/// Both data and index blocks are self-describing (see ), so search needs only a +/// block's start. Keys carry the column / subcolumn tag bytes as 255 − tag so a plain ascending +/// sort reproduces the reverse-tag emission order the columnar builder/compacter expect (see +/// ). +/// +internal static class SortedTable +{ + /// Data-block size and alignment — every data block but the last is zero-padded to this and + /// addressed by block number (byte offset = blockNumber · BlockSize). + internal const int BlockSize = PageLayout.PageSize; + + /// Default front-coding restart interval (records per restart run). + internal const int DefaultRestartInterval = 8; + + /// Width of an index block's value — a u32 block number. + internal const int IndexValueSize = sizeof(uint); + + /// Fixed footer: record count (i64), data-block count (i64), index-block byte offset (i64), + /// restart interval (u8), version (u8). 
+ internal const int FooterSize = sizeof(long) + sizeof(long) + sizeof(long) + 1 + 1; + + internal const byte FormatVersion = 6; + + /// Footer-resolved table geometry: total record count, data-block count, the + /// table-relative byte offset of the (unaligned) index block, and the front-coding restart + /// interval (shared by the data and index blocks, so a column's start block number maps to its + /// index-block restart as blockNumber / RestartInterval). + internal readonly record struct Footer(long Count, long NumDataBlocks, long IndexOffset, int RestartInterval); + + /// Reader-absolute start of the index block. + internal static long IndexBlockStart(Bound table, in Footer footer) => table.Offset + footer.IndexOffset; + + /// Reader-absolute start of data block . + internal static long DataBlockStart(Bound table, long blockNumber) => table.Offset + blockNumber * BlockSize; + + /// Read the footer of the table occupying and resolve the record + /// count, data-block count, and index-block offset. + /// false when the bound is too small, unreadable, or carries an unknown version. 
+ internal static bool TryReadFooter(scoped in TReader reader, Bound table, out Footer footer) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + footer = default; + if (table.Length < FooterSize) return false; + + Span buf = stackalloc byte[FooterSize]; + if (!reader.TryRead(table.Offset + table.Length - FooterSize, buf)) return false; + if (buf[FooterSize - 1] != FormatVersion) return false; + + long count = BinaryPrimitives.ReadInt64LittleEndian(buf); + long numDataBlocks = BinaryPrimitives.ReadInt64LittleEndian(buf[sizeof(long)..]); + long indexOffset = BinaryPrimitives.ReadInt64LittleEndian(buf[(2 * sizeof(long))..]); + int restartInterval = buf[3 * sizeof(long)]; + // Bound the fields by the actual table size so a corrupt footer cannot address outside the + // bound: data blocks live in [0, indexOffset) and the index block + footer fill the tail. + if (count < 0 || numDataBlocks < 0 || indexOffset < 0) return false; + if (restartInterval <= 0) return false; + if (numDataBlocks > table.Length / BlockSize + 1) return false; + if (indexOffset > table.Length - FooterSize) return false; + + footer = new Footer(count, numDataBlocks, indexOffset, restartInterval); + return true; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableBuilder.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableBuilder.cs new file mode 100644 index 000000000000..6ca828946752 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableBuilder.cs @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Buffers.Binary; +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Sorted; + +/// +/// Builds a by streaming: records must be ed in strictly +/// ascending key order and are written straight into 4 KiB-aligned data blocks 
as they arrive — no
+/// record buffer, so the table size is bounded by the data region (16 TiB) rather than by an in-memory
+/// buffer. The index (separator → block number) accrues one entry per flushed data block; at
+/// the final data block and the single index block are emitted, followed by the footer.
+/// 
+/// 
+/// Both the data blocks and the index reuse . Each finished data block but the
+/// last is zero-padded to so block i sits at i·BlockSize
+/// and is addressed by block number; the index block is written right after the last (unpadded) data
+/// block and located by the footer's indexOffset. The index entry for a block is the shortest
+/// separator between that block's last key and the next block's first key (the last block uses its own
+/// last key). Only the current data block and the index are buffered.
+/// 
+internal ref struct SortedTableBuilder where TWriter : IByteBufferWriter
+{
+    private ref TWriter _writer;
+    private readonly long _tableStart;
+    private readonly int _restartInterval;
+    private readonly BlockBuilder _dataBlock;
+    private readonly BlockBuilder _indexBlock;
+    // Last key Added overall — also the last key of the current data block, used to enforce ascending
+    // order and to derive the separator when a block flushes. Keys are ≤ 255 bytes.
+    private readonly byte[] _prevKey;
+    private int _prevKeyLen;
+    // Number of data blocks flushed so far == the block number to assign to the next flushed block.
+    private long _blockNumber;
+    private long _count;
+
+    /// <summary>
+    /// Start a table at the writer's current position; every table-relative offset (block padding,
+    /// the footer's index offset) is measured from that position.
+    /// </summary>
+    /// <param name="writer">Destination writer; its current <c>Written</c> value becomes the table start.</param>
+    /// <param name="restartInterval">Front-coding restart interval shared by the data and index
+    /// blocks. Must be in 1..255 because the footer stores it in a single byte.</param>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="restartInterval"/> is outside 1..255.</exception>
+    public SortedTableBuilder(ref TWriter writer, int restartInterval = SortedTable.DefaultRestartInterval)
+    {
+        // Build() writes the interval as one footer byte. Without this check a larger value is
+        // truncated silently: the footer then no longer matches the blocks, or holds 0, which
+        // SortedTable.TryReadFooter rejects, making the finished table unreadable.
+        ArgumentOutOfRangeException.ThrowIfLessThan(restartInterval, 1);
+        ArgumentOutOfRangeException.ThrowIfGreaterThan(restartInterval, byte.MaxValue);
+
+        _writer = ref writer;
+        _tableStart = writer.Written;
+        _restartInterval = restartInterval;
+        _dataBlock = new BlockBuilder(restartInterval, SortedTable.BlockSize);
+        _indexBlock = new BlockBuilder(restartInterval);
+        // One reusable buffer for the previous key, so Add() does not allocate per record.
+        _prevKey = new byte[256];
+    }
+
+    /// Stream one record. 
Keys must arrive in strictly ascending order and be unique; key and + /// value lengths must each be ≤ 255. + /// The key is not strictly greater than the previous key. + public void Add(scoped ReadOnlySpan key, scoped ReadOnlySpan value) + { + if (_count > 0 && ((ReadOnlySpan)_prevKey.AsSpan(0, _prevKeyLen)).SequenceCompareTo(key) >= 0) + throw new ArgumentException("Keys must be added in strictly ascending order.", nameof(key)); + + if (_dataBlock.RecordCount > 0 && _dataBlock.WouldExceedIfAdded(key.Length, value.Length, SortedTable.BlockSize)) + FlushDataBlock(key); + + _dataBlock.Add(key, value); + key.CopyTo(_prevKey); + _prevKeyLen = key.Length; + _count++; + } + + /// Emit the final data block, the index block, and the footer. + public void Build() + { + if (_dataBlock.RecordCount > 0) FlushDataBlock(nextFirstKey: default); + + // The index block begins right after the last (unpadded) data block; record its offset so the + // reader can locate it directly without recomputing it from the block count. + long indexOffset = _writer.Written - _tableStart; + _indexBlock.Finish(ref _writer); + + Span footer = _writer.GetSpan(SortedTable.FooterSize); + BinaryPrimitives.WriteInt64LittleEndian(footer, _count); + BinaryPrimitives.WriteInt64LittleEndian(footer[sizeof(long)..], _blockNumber); + BinaryPrimitives.WriteInt64LittleEndian(footer[(2 * sizeof(long))..], indexOffset); + footer[3 * sizeof(long)] = (byte)_restartInterval; + footer[3 * sizeof(long) + 1] = SortedTable.FormatVersion; + _writer.Advance(SortedTable.FooterSize); + } + + /// Emit the current data block (4 KiB-padding it unless it is the final block) and record + /// its separator → block number in the index. The separator is the shortest key in + /// [lastKey, nextFirstKey); the final block ( empty) uses its + /// own last key. 
+ private void FlushDataBlock(scoped ReadOnlySpan nextFirstKey) + { + _dataBlock.Finish(ref _writer); + bool isLast = nextFirstKey.IsEmpty; + if (!isLast) PadZeros((-(_writer.Written - _tableStart)) & (SortedTable.BlockSize - 1)); + + Span sepBuf = stackalloc byte[256]; + ReadOnlySpan lastKey = _prevKey.AsSpan(0, _prevKeyLen); + int sepLen; + if (isLast) + { + lastKey.CopyTo(sepBuf); + sepLen = _prevKeyLen; + } + else + { + sepLen = FindShortestSeparator(lastKey, nextFirstKey, sepBuf); + } + + Span blockNumBuf = stackalloc byte[SortedTable.IndexValueSize]; + BinaryPrimitives.WriteUInt32LittleEndian(blockNumBuf, checked((uint)_blockNumber)); + _indexBlock.Add(sepBuf[..sepLen], blockNumBuf); + _blockNumber++; + _dataBlock.Reset(); + } + + private void PadZeros(long count) + { + while (count > 0) + { + int chunk = (int)Math.Min(count, 256); + _writer.GetSpan(chunk)[..chunk].Clear(); + _writer.Advance(chunk); + count -= chunk; + } + } + + /// Shortest key S with S < + /// (caller guarantees < ), written to + /// ; returns its length. Falls back to when it cannot be + /// shortened. 
+ private static int FindShortestSeparator(scoped ReadOnlySpan a, scoped ReadOnlySpan b, scoped Span dst) + { + int min = Math.Min(a.Length, b.Length); + int l = 0; + while (l < min && a[l] == b[l]) l++; + if (l < min && a[l] + 1 < b[l]) + { + a[..l].CopyTo(dst); + dst[l] = (byte)(a[l] + 1); + return l + 1; + } + a.CopyTo(dst); + return a.Length; + } + + public void Dispose() + { + _dataBlock.Dispose(); + _indexBlock.Dispose(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableEnumerator.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableEnumerator.cs new file mode 100644 index 000000000000..fc38abb7bb62 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableEnumerator.cs @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Sorted; + +/// +/// Forward cursor over a in ascending key order. Walks the data blocks in +/// order (block i at i·BlockSize), skipping each block's self-describing header and stopping at +/// its recordsEnd (never the zero-padding), reconstructing front-coded keys (the cp = 0 +/// reset at every restart and block start makes the running key self-correct). A plain struct (not a +/// ref struct) so callers — the N-way merger and the scanner — can hold many in an array; it does not +/// store the reader, taking it via . The current key is copied into an internal +/// buffer so it stays valid across reader-minting calls in the merge. 
+/// 
+internal struct SortedTableEnumerator
+    where TPin : struct, IBufferPin, allows ref struct
+    where TReader : IByteReader, allows ref struct
+{
+    private readonly long _tableOffset;
+    private readonly long _numDataBlocks;
+    private long _blockIdx;
+    private long _pos;
+    private long _blockEnd;
+    private byte[] _keyBuf;
+    private int _keyLength;
+    private Bound _value;
+
+    /// <summary>Start the forward scan at data block 0.</summary>
+    public SortedTableEnumerator(scoped in TReader reader, Bound table) : this(in reader, table, 0) { }
+
+    /// 
+    /// Start the forward scan at data block instead of block 0 — used
+    /// with to begin near a column boundary without
+    /// walking the earlier blocks. The first record yielded may precede the seek key within that block;
+    /// the caller skips down to its column of interest.
+    /// 
+    public SortedTableEnumerator(scoped in TReader reader, Bound table, long startBlockIdx)
+    {
+        // Fixed: keys are ≤ 255 bytes, and the running key must retain its prefix across records.
+        _keyBuf = new byte[256];
+        _tableOffset = table.Offset;
+        // An unreadable footer leaves _numDataBlocks at 0, so the first MoveNext returns false.
+        if (SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer))
+            _numDataBlocks = footer.NumDataBlocks;
+        // Before startBlockIdx; the first MoveNext loads it (_pos == _blockEnd == 0).
+        _blockIdx = startBlockIdx - 1;
+    }
+
+    /// <summary>
+    /// Advance to the next record, crossing into the following data block(s) when the current
+    /// block's records are exhausted.
+    /// </summary>
+    /// <param name="reader">Reader over the same bytes the enumerator was created on; it is not stored.</param>
+    /// <returns>false once the last data block is exhausted, when a header or record read fails,
+    /// or when a record claims a key longer than the internal key buffer.</returns>
+    public bool MoveNext(scoped in TReader reader)
+    {
+        // Cross into the next data block(s), skipping each self-describing header.
+        while (_pos >= _blockEnd)
+        {
+            _blockIdx++;
+            if (_blockIdx >= _numDataBlocks) return false;
+            long blockStart = _tableOffset + _blockIdx * SortedTable.BlockSize;
+            if (!BlockReader.ReadHeader(in reader, blockStart, out _, out long recordsEnd, out _, out long recordsStart))
+                return false;
+            _pos = blockStart + recordsStart;
+            _blockEnd = blockStart + recordsEnd;
+        }
+
+        Span hdr = stackalloc byte[2]; // [commonPrefix u8][suffixLen u8]
+        if (!reader.TryRead(_pos, hdr)) return false;
+        int cp = hdr[0];
+        int suffixLen = hdr[1];
+        int keyLength = cp + suffixLen;
+        // cp and suffixLen are unchecked bytes (sum up to 510); a corrupt record must end the scan
+        // like any other unreadable input rather than make AsSpan throw on the 256-byte buffer.
+        if (keyLength > _keyBuf.Length) return false;
+        // Front-coded: keep _keyBuf[0..cp) from the previous record, append this record's suffix.
+        if (!reader.TryRead(_pos + 2, _keyBuf.AsSpan(cp, suffixLen))) return false;
+        _keyLength = keyLength;
+
+        // The value's one-byte size prefix sits right after the key suffix.
+        long valueSizeOffset = _pos + 2 + suffixLen;
+        if (!reader.TryRead(valueSizeOffset, hdr[..1])) return false;
+        int valueLength = hdr[0];
+        _value = new Bound(valueSizeOffset + Block.SizePrefix, valueLength);
+
+        _pos = valueSizeOffset + Block.SizePrefix + valueLength;
+        return true;
+    }
+
+    /// <summary>Key of the record loaded by the last successful MoveNext. It is a view over the
+    /// internal buffer, so the next MoveNext overwrites it.</summary>
+    public readonly ReadOnlySpan CurrentKey => _keyBuf.AsSpan(0, _keyLength);
+    /// <summary>Location of the current record's value bytes (just past its size prefix).</summary>
+    public readonly Bound CurrentValue => _value;
+}
diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableReader.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableReader.cs
new file mode 100644
index 000000000000..0e46e013c3db
--- /dev/null
+++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Sorted/SortedTableReader.cs
@@ -0,0 +1,120 @@
+// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited
+// SPDX-License-Identifier: LGPL-3.0-only
+
+using System.Buffers.Binary;
+using Nethermind.State.Flat.Io;
+
+namespace Nethermind.State.Flat.PersistedSnapshots.Sorted;
+
+/// 
+/// Lookup over a : a ceiling search of the index block selects a data block
+/// number, then a ceiling search of that data block resolves the exact key. Two
+/// calls. Wire layout: . 
+/// +internal static class SortedTableReader +{ + /// + /// Seek in the table occupying . On a hit returns + /// the reader-absolute of the matching record's value. + /// + internal static bool TrySeek(scoped in TReader reader, Bound table, scoped ReadOnlySpan key, out Bound value) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + value = default; + if (!SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer) + || footer.NumDataBlocks == 0) + return false; + + // Stage 1: ceiling over the index block — first separator ≥ target → its data block number. + Span sepBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, SortedTable.IndexBlockStart(table, footer), key, sepBuf, out _, out Bound blockRef)) + return false; + + Span bn = stackalloc byte[SortedTable.IndexValueSize]; + if (!reader.TryRead(blockRef.Offset, bn)) return false; + long blockNumber = BinaryPrimitives.ReadUInt32LittleEndian(bn); + + // Stage 2: ceiling over the data block; a hit requires the ceiling key to equal the target. + Span keyBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, SortedTable.DataBlockStart(table, blockNumber), key, keyBuf, out int keyLen, out Bound v)) + return false; + if (!key.SequenceEqual(keyBuf[..keyLen])) return false; + + value = v; + return true; + } + + /// + /// Seek using a pre-read and a known column block + /// range, clamping the stage-1 index ceiling search to the index restarts covering data blocks + /// .. (index record i ↔ data block + /// i, both using the footer's restart interval). For any key in the column's tag range this + /// returns exactly what would — only the index search is + /// narrowed and the per-lookup footer read is skipped. 
+ /// + internal static bool TrySeekInColumn(scoped in TReader reader, Bound table, + in SortedTable.Footer footer, long loBlock, long hiBlock, scoped ReadOnlySpan key, out Bound value) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + value = default; + if (footer.NumDataBlocks == 0) return false; + + long firstRestart = loBlock / footer.RestartInterval; + long lastRestart = hiBlock / footer.RestartInterval; + + // Stage 1: ceiling over the index block, restricted to this column's restart window. + Span sepBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, SortedTable.IndexBlockStart(table, footer), key, sepBuf, out _, out Bound blockRef, firstRestart, lastRestart)) + return false; + + Span bn = stackalloc byte[SortedTable.IndexValueSize]; + if (!reader.TryRead(blockRef.Offset, bn)) return false; + long blockNumber = BinaryPrimitives.ReadUInt32LittleEndian(bn); + + // Stage 2: ceiling over the data block; a hit requires the ceiling key to equal the target. + Span keyBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, SortedTable.DataBlockStart(table, blockNumber), key, keyBuf, out int keyLen, out Bound v)) + return false; + if (!key.SequenceEqual(keyBuf[..keyLen])) return false; + + value = v; + return true; + } + + /// + /// Resolve the data block number whose range covers using only the stage-1 + /// index-block ceiling search. Lets a caller start a forward + /// scan near a key (e.g. at a column boundary) without walking the table from block 0. + /// + /// false when the table is empty or the footer / index block is unreadable. 
+ internal static bool TryFindStartBlock(scoped in TReader reader, Bound table, scoped ReadOnlySpan key, out long blockNumber) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + blockNumber = 0; + return SortedTable.TryReadFooter(in reader, table, out SortedTable.Footer footer) + && TryFindStartBlock(in reader, table, in footer, key, out blockNumber); + } + + /// + /// Overload taking a pre-read to avoid re-reading it when the + /// caller resolves several column start blocks in one pass. + internal static bool TryFindStartBlock(scoped in TReader reader, Bound table, in SortedTable.Footer footer, scoped ReadOnlySpan key, out long blockNumber) + where TPin : struct, IBufferPin, allows ref struct + where TReader : IByteReader, allows ref struct + { + blockNumber = 0; + if (footer.NumDataBlocks == 0) return false; + + Span sepBuf = stackalloc byte[256]; + if (!BlockReader.SeekCeiling(in reader, SortedTable.IndexBlockStart(table, footer), key, sepBuf, out _, out Bound blockRef)) + return false; + + Span bn = stackalloc byte[SortedTable.IndexValueSize]; + if (!reader.TryRead(blockRef.Offset, bn)) return false; + blockNumber = BinaryPrimitives.ReadUInt32LittleEndian(bn); + return true; + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaBufferWriter.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaBufferWriter.cs new file mode 100644 index 000000000000..8d5ecd04db5c --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaBufferWriter.cs @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Collections; +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Arena-backed with a 1 MiB write-buffer. 
+/// 
+/// 
+/// The buffer is a held at Count == Capacity,
+/// so exposes the whole backing buffer and the
+/// writer slices the free tail with its own _buffered cursor. A hint larger than
+/// the current buffer grows it by reconstruction (after a flush).
+/// 
+public struct ArenaBufferWriter(Stream stream, long firstOffset)
+    : IByteBufferWriter, IDisposable
+{
+    private const int BufferSize = 1024 * 1024;
+    private const int MaxSizeHint = 8 * 1024 * 1024; // 8 MiB
+
+    private readonly Stream _stream = stream;
+    private readonly long _firstOffset = firstOffset;
+    private NativeMemoryList _buffer = new(BufferSize, BufferSize);
+    // Bytes written into _buffer but not yet sent to the stream.
+    private int _buffered;
+    // Bytes already written to the stream by Flush.
+    private long _flushed;
+
+    /// <summary>
+    /// Return the free tail of the write buffer, flushing first (and swapping in a larger buffer
+    /// if needed) when the tail is smaller than <paramref name="sizeHint"/>.
+    /// </summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="sizeHint"/> exceeds 8 MiB.</exception>
+    public Span GetSpan(int sizeHint)
+    {
+        ArgumentOutOfRangeException.ThrowIfGreaterThan(sizeHint, MaxSizeHint);
+
+        if (sizeHint > _buffer.Count - _buffered)
+        {
+            Flush();
+            // Honor the hint exactly: after the flush the buffer is empty and its
+            // bytes are on the stream, so it can be swapped for a larger one.
+            if (sizeHint > _buffer.Count)
+            {
+                _buffer.Dispose();
+                _buffer = new(sizeHint, sizeHint);
+            }
+        }
+
+        return _buffer.AsSpan()[_buffered..];
+    }
+
+    /// <summary>Mark <paramref name="count"/> bytes of the last returned span as written. The
+    /// count is not validated here.</summary>
+    public void Advance(int count) => _buffered += count;
+
+    /// <summary>Total bytes accepted so far: flushed to the stream plus still buffered.</summary>
+    public readonly long Written => _flushed + _buffered;
+
+    /// <summary>The <c>firstOffset</c> value this writer was constructed with.</summary>
+    public readonly long FirstOffset => _firstOffset;
+
+    /// <summary>Write any buffered bytes to the stream, then flush the stream itself.</summary>
+    public void Flush()
+    {
+        if (_buffered > 0)
+        {
+            _stream.Write(_buffer.AsSpan()[.._buffered]);
+            _flushed += _buffered;
+            _buffered = 0;
+        }
+        _stream.Flush();
+    }
+
+    /// <summary>
+    /// Flush the pending bytes, then release the stream and the write buffer. Both are released
+    /// even when the final flush (or the stream's own dispose) throws; the exception still
+    /// propagates so a failed write is not hidden.
+    /// </summary>
+    public void Dispose()
+    {
+        try
+        {
+            Flush();
+        }
+        finally
+        {
+            // Nested so a throwing stream dispose cannot leak the buffer either.
+            try
+            {
+                _stream.Dispose();
+            }
+            finally
+            {
+                _buffer.Dispose();
+            }
+        }
+    }
+}
diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaByteReader.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaByteReader.cs
new file mode 100644
index 000000000000..e711ac59723b
--- /dev/null
+++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaByteReader.cs
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited
+// SPDX-License-Identifier: LGPL-3.0-only
+
+using System.Runtime.Intrinsics.X86;
+using Nethermind.State.Flat.Io;
+
+namespace Nethermind.State.Flat.PersistedSnapshots.Storage;
+
+/// 
+/// Pointer-backed over an arena-mmap region.
+/// Holds a raw byte* + length so the addressed region can exceed
+/// 2 GiB (each individual pin still materialises an int-sized ).
+/// Each read or pin reports touched OS pages to
+/// for pre-fault coalescing.
+/// 
+public unsafe ref struct ArenaByteReader : IByteReader
+{
+    private readonly byte* _basePtr;
+    private readonly long _length;
+    private readonly ArenaReservation _reservation;
+    private readonly long _baseOffset;
+    // OS page size is a power of two — mask for the in-page offset / page-base computation.
+    private readonly long _pageMask;
+    // Page-aligned absolute address of the last touched range. -1 sentinel = uninitialised.
+    // Used to skip the per-page Touch loop when a single-page access stays within the same OS
+    // page as the previous access — the common case for table seeks that re-read sequential
+    // bytes within one node.
+    private long _lastPageBase;
+
+    /// <summary>Create a reader over <paramref name="length"/> bytes starting at <paramref name="basePtr"/>.</summary>
+    /// <param name="basePtr">First byte of the readable region.</param>
+    /// <param name="length">Size of the readable region in bytes.</param>
+    /// <param name="reservation">Reservation notified of every touched range; its <c>Offset</c>
+    /// is added to local offsets when computing page bases.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="reservation"/> is null.</exception>
+    public ArenaByteReader(byte* basePtr, long length, ArenaReservation reservation)
+    {
+        ArgumentNullException.ThrowIfNull(reservation);
+        _basePtr = basePtr;
+        _length = length;
+        _reservation = reservation;
+        _baseOffset = reservation.Offset;
+        _pageMask = PageLayout.OsPageSize - 1;
+        _lastPageBase = -1;
+    }
+
+    public long Length => _length;
+
+    // Overflow-safe test that [offset, offset + length) lies inside [0, total). Comparing the
+    // length against the remaining room avoids the unsigned wrap-around of offset + length, which
+    // let a small negative offset (e.g. -1 with length 1) pass the earlier combined check and
+    // read before the base pointer. Negative offsets and lengths cast to huge values and fail.
+    private static bool InBounds(long offset, long length, long total) =>
+        (ulong)offset <= (ulong)total && (ulong)length <= (ulong)total - (ulong)offset;
+
+    /// <summary>Copy <c>output.Length</c> bytes starting at <paramref name="offset"/> into <paramref name="output"/>.</summary>
+    /// <returns>false, with nothing copied, when the requested range is not fully inside the region.</returns>
+    public bool TryRead(long offset, scoped Span output)
+    {
+        if (!InBounds(offset, output.Length, _length)) return false;
+        TouchRange(offset, output.Length);
+        new ReadOnlySpan(_basePtr + offset, output.Length).CopyTo(output);
+        return true;
+    }
+
+    /// <summary>Expose the bytes of <paramref name="bound"/> in place, without copying.</summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bound"/> is not fully inside the region.</exception>
+    /// <exception cref="OverflowException">The bound's length does not fit in an int.</exception>
+    public NoOpPin PinBuffer(Bound bound)
+    {
+        if (!InBounds(bound.Offset, bound.Length, _length))
+            throw new ArgumentOutOfRangeException(nameof(bound));
+        TouchRange(bound.Offset, bound.Length);
+        return new NoOpPin(new ReadOnlySpan(_basePtr + bound.Offset, checked((int)bound.Length)));
+    }
+
+    /// 
+    /// Prefetches the body of a BTree node whose first byte was just read (page + TLB now resident):
+    /// pulls the two cache lines after the header line so the floor-search's key scan finds them warm.
+    /// is the node start; line 0 is already cached from the flag-byte read. 
+ /// + public readonly void Prefetch(long offset) + { + if (!Sse.IsSupported || (ulong)offset >= (ulong)_length) return; + byte* p = _basePtr + offset; + Sse.Prefetch0(p + 64); + Sse.Prefetch0(p + 128); + } + + private void TouchRange(long localOffset, long length) + { + if (length <= 0) return; + long absStart = _baseOffset + localOffset; + long absEnd = absStart + length - 1; + long startPageBase = absStart & ~_pageMask; + long endPageBase = absEnd & ~_pageMask; + if (startPageBase == endPageBase && startPageBase == _lastPageBase) return; + _lastPageBase = endPageBase; + + _reservation.TouchRangePopulate(localOffset, length); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaFile.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaFile.cs new file mode 100644 index 000000000000..469081490a0f --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaFile.cs @@ -0,0 +1,311 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Diagnostics.CodeAnalysis; +using System.IO.MemoryMappedFiles; +using System.Runtime.InteropServices; +using Microsoft.Win32.SafeHandles; +using Nethermind.Core.Utils; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// A single append-only arena file for storing persisted snapshot table data. +/// Reads use a read-only mmap for zero-copy access; writes go through a +/// seeked to the target offset. +/// +/// +/// Lifecycle is refcounted: the owning 's dictionary entry +/// holds the initial lease (count 1). Each referencing +/// the file holds an additional lease. The manager drops its lease via +/// (typically through or one of the cancel paths +/// / ); +/// the on-disk file is deleted by when the last lease is released, +/// unless the manager is in shutdown — in which case the file is preserved for the +/// next session. 
///
///
public sealed unsafe class ArenaFile : RefCountingDisposable
{
    // Advice values handed to Madvise below; the names mirror the madvise(2) constants.
    private const int MADV_NORMAL = 0;
    private const int MADV_RANDOM = 1;
    private const int MADV_DONTNEED = 4;
    private const int MADV_POPULATE_READ = 22;
    // Page size used by the inward page-alignment helpers in this class.
    private static readonly nuint PageSize = (nuint)Environment.SystemPageSize;

    // Raw libc binding. Callers in this class only invoke it after an OperatingSystem.IsLinux()
    // check and ignore the return value.
    [DllImport("libc", EntryPoint = "madvise", SetLastError = true)]
    private static extern int Madvise(void* addr, nuint length, int advice);

    // File handle opened read/write in the constructor; used for length changes and as the
    // backing handle of the read-only mapping.
    private readonly SafeFileHandle _handle;
    // Current read-only mapping, its view accessor and the acquired view pointer. All three are
    // replaced together whenever the mapping is closed and reopened.
    private MemoryMappedFile _mmf;
    private MemoryMappedViewAccessor _accessor;
    private byte* _basePtr;
    // Treated as bool; 0 = delete on CleanUp, 1 = keep the on-disk file. Set by
    // PersistOnShutdown via Interlocked.Exchange so it is safe to call from any path.
    private int _preserveOnDispose;

    /// <summary>
    /// Raw pointer to the first byte of the arena's mmap. Long-offset arithmetic is OK across
    /// the full <see cref="MappedSize"/>.
    /// </summary>
    public byte* BasePtr => _basePtr;

    /// <summary>Arena id supplied to the constructor.</summary>
    public int Id { get; }
    // On-disk path of the arena file.
    private string Path { get; }
    /// <summary>Size in bytes of the current mapping; updated when the file is truncated.</summary>
    public long MappedSize { get; private set; }

    /// <summary>
    /// True for arenas holding sub-CompactSize snapshots (the PersistedBase and
    /// PersistedSmallCompacted tiers). Those snapshots are written almost as often as the
    /// larger tiers but are demoted right after compaction and rarely read again, so they live in
    /// their own files (and their own mutable pool in <see cref="ArenaManager"/>) to keep cold,
    /// write-heavy data off the hot working set.
    /// </summary>
    public bool Small { get; }

    /// <summary>
    /// Next-write offset within this arena (in bytes). Set by <see cref="ArenaWriter.Complete"/>
    /// directly so the manager doesn't have to keep a parallel dict; read by
    /// <see cref="ArenaManager.MarkDead"/> to detect "all bytes dead" and by writer-allocation
    /// to choose the next write offset for shared (non-dedicated) arenas.
    /// </summary>
    internal long Frontier { get; set; }

    ///
    /// Cumulative bytes marked dead by . When this reaches
    /// the arena has no live data and the manager drops it.
/// Per-file state held on the file itself so the manager doesn't keep a parallel dict.
///
internal long DeadBytes { get; set; }

/// <summary>
/// Last value of <see cref="Frontier"/> reported to Metrics.ArenaAllocatedBytes.
/// Lets <see cref="ArenaManager.OnWriteCompleted"/> push frontier deltas on writer.Complete
/// without keeping a parallel dict and without re-counting bytes it already reported.
/// </summary>
internal long ReportedFrontier { get; set; }

// Push-style gauge updates, called by ArenaManager under its lock at every file add / remove site.
// The bytes gauge tracks **allocated** bytes (Frontier — what's been written), not the pre-extended
// mmap region.

/// <summary>
/// Counts this file in the arena-file gauge and adds its current <see cref="Frontier"/> to the
/// allocated-bytes gauge, remembering the amount in <see cref="ReportedFrontier"/>.
/// </summary>
internal void ReportAdded()
{
    Interlocked.Increment(ref Metrics._arenaFileCount);
    long frontier = Frontier;
    ReportedFrontier = frontier;
    if (frontier > 0)
        Interlocked.Add(ref Metrics._arenaAllocatedBytes, frontier);
}

/// <summary>
/// Reverses <see cref="ReportAdded"/>: decrements the file gauge and subtracts exactly the bytes
/// previously reported, then resets <see cref="ReportedFrontier"/> to zero.
/// </summary>
internal void ReportRemoved()
{
    Interlocked.Decrement(ref Metrics._arenaFileCount);
    long reported = ReportedFrontier;
    ReportedFrontier = 0;
    if (reported > 0)
        Interlocked.Add(ref Metrics._arenaAllocatedBytes, -reported);
}

/// <summary>
/// Opens (or creates) the arena file at <paramref name="path"/>, extends it to at least
/// <paramref name="mappedSize"/> bytes and maps it read-only.
/// </summary>
/// <param name="id">Arena id exposed through <see cref="Id"/>.</param>
/// <param name="path">On-disk location of the arena file.</param>
/// <param name="mappedSize">Number of bytes to map; the file is grown when shorter.</param>
/// <param name="small">Value exposed through <see cref="Small"/>.</param>
public ArenaFile(int id, string path, long mappedSize, bool small = false)
{
    Id = id;
    Path = path;
    MappedSize = mappedSize;
    Small = small;

    _handle = File.OpenHandle(path, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.ReadWrite);
    try
    {
        // Extend to mappedSize (sparse on Linux via ftruncate).
        if (RandomAccess.GetLength(_handle) < mappedSize)
            RandomAccess.SetLength(_handle, mappedSize);

        OpenMmap(mappedSize);
    }
    catch
    {
        // The instance never escapes a throwing constructor, so nothing else would ever close
        // the handle — release it here instead of leaking it.
        _handle.Dispose();
        throw;
    }
}

/// <summary>
/// Try to acquire a lease without throwing on a disposing file. Returns false when the
/// file is already in cleanup.
/// </summary>
internal new bool TryAcquireLease() => base.TryAcquireLease();

///
/// Create a write stream seeked to .
/// The caller is responsible for disposing the returned stream.
///
internal FileStream CreateWriteStream(long startOffset)
{
    FileStream fs = new(Path, FileMode.Open, FileAccess.Write, FileShare.ReadWrite, bufferSize: 1);
    try
    {
        fs.Seek(startOffset, SeekOrigin.Begin);
        return fs;
    }
    catch
    {
        // The caller never receives the stream when Seek throws, so it must be closed here.
        fs.Dispose();
        throw;
    }
}

/// <summary>
/// Shrink (or otherwise resize) the file to <paramref name="newSize"/> in place: close the
/// current mmap view, SetLength on the underlying handle, then reopen the mmap.
/// The same <see cref="ArenaFile"/> instance and its refcount survive the resize; pointers
/// obtained from <see cref="BasePtr"/> before the call are invalid afterwards. The trim path
/// only runs before any reservation is created against this file. The caller must hold the
/// manager's lock.
/// </summary>
/// <remarks>
/// If changing the length fails, the mapping is reopened at the previous size before the
/// exception propagates, so the instance stays usable and can still be cleaned up.
/// </remarks>
internal void Truncate(long newSize)
{
    if (newSize == MappedSize) return;
    CloseMmap();
    try
    {
        RandomAccess.SetLength(_handle, newSize);
        MappedSize = newSize;
    }
    finally
    {
        // MappedSize is only updated after a successful SetLength, so this remaps at the new
        // size on success and at the old size on failure.
        OpenMmap(MappedSize);
    }
}

// Maps the first <size> bytes of the file read-only and caches the view's base pointer.
[MemberNotNull(nameof(_mmf), nameof(_accessor))]
private void OpenMmap(long size)
{
    MemoryMappedFile mmf = MemoryMappedFile.CreateFromFile(_handle, mapName: null, size, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true);
    MemoryMappedViewAccessor accessor;
    try
    {
        accessor = mmf.CreateViewAccessor(0, size, MemoryMappedFileAccess.Read);
    }
    catch
    {
        // Don't leave a half-built mapping behind when the view cannot be created.
        mmf.Dispose();
        throw;
    }

    _mmf = mmf;
    _accessor = accessor;
    _basePtr = null;
    _accessor.SafeMemoryMappedViewHandle.AcquirePointer(ref _basePtr);

    if (OperatingSystem.IsLinux())
        Madvise(_basePtr, (nuint)size, MADV_RANDOM);
}

// Releases the pointer acquired in OpenMmap, then the accessor and the mapping.
private void CloseMmap()
{
    _accessor.SafeMemoryMappedViewHandle.ReleasePointer();
    _accessor.Dispose();
    _mmf.Dispose();
    _basePtr = null;
}

/// <summary>
/// madvise(MADV_DONTNEED) over the whole pages contained in
/// <c>[offset, offset + size)</c>. Does nothing off Linux or when the range contains no full page.
/// </summary>
public void AdviseDontNeed(long offset, long size)
{
    if (!OperatingSystem.IsLinux()) return;

    if (TryAlignInward(offset, size, out nuint start, out nuint len))
        Madvise(_basePtr + start, len, MADV_DONTNEED);
}

// Round offset up to page boundary, round end down — only cover full pages.
private static bool TryAlignInward(long offset, long size, out nuint start, out nuint len)
{
    nuint mask = PageSize - 1;
    nuint rawStart = (nuint)offset;
    nuint rawEnd = rawStart + (nuint)size;

    // First page boundary at or after the range start, last page boundary at or before its end.
    start = (rawStart + mask) & ~mask;
    nuint alignedEnd = rawEnd & ~mask;
    len = alignedEnd - start;
    return alignedEnd > start;
}

/// <summary>
/// Issues madvise(MADV_POPULATE_READ) for the whole pages inside [offset, offset+size).
/// The original note on this method states that Linux ≥ 5.14 pre-faults the pages so the next
/// read does not block on a page fault, and that older kernels answer EINVAL; the return value
/// is ignored either way. No-op on other operating systems.
/// </summary>
public void PopulateRead(long offset, long size)
{
    if (OperatingSystem.IsLinux() && TryAlignInward(offset, size, out nuint start, out nuint len))
    {
        Madvise(_basePtr + start, len, MADV_POPULATE_READ);
    }
}

/// <summary>
/// Forwards to <c>PosixReclaim.FadviseDontNeed</c> with this file's descriptor for
/// [offset, offset+size), asking the OS to drop those pages from its file cache.
/// </summary>
public void FadviseDontNeed(long offset, long size)
{
    int fd = (int)_handle.DangerousGetHandle();
    PosixReclaim.FadviseDontNeed(fd, offset, size);
}

/// <summary>
/// Forwards to <c>PosixReclaim.TryPunchHole</c> with this file's descriptor for
/// [offset, offset + size): frees the dead range's disk blocks without changing the file
/// length. Punched pages read back as zero through the mmap.
/// </summary>
/// <returns>The outcome reported by <c>PosixReclaim.TryPunchHole</c>.</returns>
internal PosixReclaim.PunchHoleOutcome PunchHole(long offset, long size)
{
    int fd = (int)_handle.DangerousGetHandle();
    return PosixReclaim.TryPunchHole(fd, offset, size);
}

///
/// fsync(2) the underlying file — block until all previously written bytes are
/// durable on disk.
/// Called by the persisted-snapshot convert/compact paths before the
/// catalog records the new entry so a crash cannot leave the catalog pointing at
/// unsynced pages.
///
internal void Fsync() => PosixReclaim.Fsync((int)_handle.DangerousGetHandle());

/// <summary>
/// Open a fresh per-reservation mmap view over [offset, offset+size) with the
/// MADV_NORMAL hint, distinct from the global random-access view used by point
/// queries. When <paramref name="adviseDontNeedOnDispose"/> is true, disposing the
/// returned view applies MADV_DONTNEED to the range before releasing the
/// mapping; when false the disposer just unmaps.
/// </summary>
internal MmapWholeView OpenWholeView(long offset, long size, bool adviseDontNeedOnDispose)
{
    MemoryMappedViewAccessor accessor = _mmf.CreateViewAccessor(offset, size, MemoryMappedFileAccess.Read);
    byte* ptr = null;
    accessor.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
    // The accessor's pointer is offset by an internal page-aligned skew; add it
    // so the span starts at the requested offset's first byte.
    long skew = accessor.PointerOffset;
    byte* dataPtr = ptr + skew;
    // madvise(2) rejects an address that is not page-aligned, so hint from the view's aligned
    // base (covering the skew plus the data) rather than from dataPtr. With zero skew this is
    // the same call as advising [dataPtr, dataPtr + size).
    if (OperatingSystem.IsLinux())
        Madvise(ptr, (nuint)(skew + size), MADV_NORMAL);
    return new MmapWholeView(accessor, dataPtr, size, adviseDontNeedOnDispose);
}

/// <summary>
/// A scoped read-only mmap view over a reservation's bytes: a fresh per-reservation accessor with the
/// MADV_NORMAL hint, distinct from the global random-access view used by point queries. When
/// adviseDontNeedOnDispose is set, disposing applies MADV_DONTNEED to the range so the
/// kernel can reclaim those pages from the page cache.
/// </summary>
internal sealed unsafe class MmapWholeView(
    MemoryMappedViewAccessor accessor, byte* dataPtr, long size, bool adviseDontNeedOnDispose) : IDisposable
{
    // 0 until the first Dispose call; guards against releasing the view pointer twice.
    private int _disposed;

    /// <summary>
    /// Raw pointer to the first byte of the view. Long-offset arithmetic is valid for the entire
    /// <see cref="Size"/> range; the mapping is kept alive until <see cref="Dispose"/>. The size is
    /// a <see cref="long"/>, so consume via a pointer-backed reader, not a single Span.
    /// </summary>
    public byte* DataPtr => dataPtr;

    /// <summary>Length of the view in bytes.</summary>
    public long Size => size;

    /// <summary>
    /// Optionally applies MADV_DONTNEED to the whole pages inside the view, then releases the
    /// pointer and the accessor. Safe to call more than once; only the first call does work.
    /// </summary>
    public void Dispose()
    {
        if (Interlocked.Exchange(ref _disposed, 1) != 0) return;

        if (adviseDontNeedOnDispose && OperatingSystem.IsLinux())
        {
            // MADV_DONTNEED on a file-backed shared mapping drops the pages from the kernel
            // page cache, so it also affects the arena's global random-access view (and any
            // other mmap of the same file). Intentional: the whole-read session has finished
            // sweeping the range and we want those pages out of cache rather than competing
            // with the random-access working set. Rounds to full pages around the data range.
            nuint pageSize = PageSize;
            nuint addr = (nuint)dataPtr;
            nuint start = (addr + pageSize - 1) & ~(pageSize - 1);
            nuint end = (addr + (nuint)size) & ~(pageSize - 1);
            if (end > start)
                Madvise((byte*)start, end - start, MADV_DONTNEED);
        }
        accessor.SafeMemoryMappedViewHandle.ReleasePointer();
        accessor.Dispose();
    }
}

/// <summary>
/// Mark this file as "preserve on disk when its refcount hits zero". Called by
/// <see cref="ArenaReservation.PersistOnShutdown"/> on the snapshot's shutdown path
/// so this session's persisted snapshots survive across restarts. Idempotent.
/// </summary>
public void PersistOnShutdown() => Interlocked.Exchange(ref _preserveOnDispose, 1);

protected override void CleanUp()
{
    CloseMmap();
    _handle.Dispose();
    // Preserve the on-disk file iff someone explicitly opted in via PersistOnShutdown;
    // otherwise delete it (the normal post-prune cleanup path).
+ if (Volatile.Read(ref _preserveOnDispose) == 0) + { + try { File.Delete(Path); } catch { /* best-effort */ } + } + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaManager.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaManager.cs new file mode 100644 index 000000000000..67dd17389a0f --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaManager.cs @@ -0,0 +1,322 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Collections.Concurrent; +using System.Globalization; +using Nethermind.Db; +using Nethermind.Logging; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Manages multiple arena files for snapshot storage. Handles allocation, +/// reading, and dead space tracking. Writes go through +/// backed by FileStream; reads use mmap. +/// +public sealed class ArenaManager : IArenaManager +{ + private const string ArenaFilePrefix = "arena_"; + private const string SmallArenaFilePrefix = "small_arena_"; + private const string DedicatedArenaFilePrefix = "dedicated_"; + private const string ArenaFileExtension = ".bin"; + + private readonly string _basePath; + private readonly long _maxArenaSize; + private readonly long _dedicatedArenaThreshold; + private readonly bool _punchHoleOnReclaim; + private readonly ILogger _logger; + private readonly ConcurrentDictionary _arenas = new(); + // Shared (non-dedicated) arenas with headroom AND not currently held by a writer. A writer + // reserves a file by removing it from this set; its Complete / Cancel re-adds it if room + // remains. Same pattern as BlobArenaManager. + private readonly HashSet _mutableArenas = []; + // Same pool, but for sub-CompactSize (Small) arenas. Keeping the two tiers in disjoint files + // segregates the cold, write-heavy small snapshots from the hot, long-lived large ones. 
private readonly HashSet<int> _mutableSmallArenas = [];
private readonly Lock _lock = new();
private int _nextArenaId;
private bool _disposed;
// 1 while fallocate(PUNCH_HOLE) is usable on the arena filesystem; latched to 0 the
// first time the kernel reports it permanently unsupported.
private int _punchHoleSupported = 1;

/// <summary>
/// Captures the sizing / reclaim settings from <paramref name="config"/> and makes sure
/// <paramref name="basePath"/> exists. No arena file is opened until <see cref="Initialize"/>
/// or <see cref="CreateWriter"/> runs.
/// </summary>
public ArenaManager(string basePath, IFlatDbConfig config, ILogManager logManager)
{
    _basePath = basePath;
    _maxArenaSize = config.ArenaFileSizeBytes;
    _dedicatedArenaThreshold = config.PersistedSnapshotDedicatedArenaThresholdBytes;
    _punchHoleOnReclaim = config.PersistedSnapshotPunchHoleOnReclaim;
    _logger = logManager.GetClassLogger();
    Directory.CreateDirectory(basePath);
}

/// <summary>
/// Initialize from existing arena files and catalog entries.
/// Computes allocation frontiers and dead bytes per arena.
/// </summary>
/// <param name="entries">Catalog entries whose locations reference arenas by id.</param>
public void Initialize(IReadOnlyList<CatalogEntry> entries)
{
    using Lock.Scope scope = _lock.EnterScope();
    // Open existing arena files. Defer the per-file metric push until after frontier
    // computation so the initial ArenaAllocatedBytes delta reflects the
    // catalog-derived high-water mark, not 0.
    foreach (string file in Directory.GetFiles(_basePath, $"*{ArenaFileExtension}"))
    {
        string fileName = Path.GetFileName(file);
        // Order matters: "small_arena_" does not start with "arena_", but check the longer/more
        // specific prefixes first to keep the classification unambiguous.
        string? prefix =
            fileName.StartsWith(DedicatedArenaFilePrefix, StringComparison.Ordinal) ? DedicatedArenaFilePrefix
            : fileName.StartsWith(SmallArenaFilePrefix, StringComparison.Ordinal) ? SmallArenaFilePrefix
            : fileName.StartsWith(ArenaFilePrefix, StringComparison.Ordinal) ? ArenaFilePrefix
            : null;
        if (prefix is null) continue;

        int arenaId = ParseArenaId(file, prefix);
        if (arenaId < 0) continue;

        // An empty file on disk is mapped at the configured arena size (the ArenaFile
        // constructor extends it).
        long fileLength = new FileInfo(file).Length;
        long mappedSize = fileLength > 0 ? fileLength : _maxArenaSize;

        ArenaFile arena = new(arenaId, file, mappedSize, small: prefix == SmallArenaFilePrefix);
        _arenas[arenaId] = arena;
        _nextArenaId = Math.Max(_nextArenaId, arenaId + 1);
    }

    // Compute frontiers (max end-offset of any slice referencing the arena) and live
    // sizes from the catalog. Entries pointing at arena ids we didn't load on disk are
    // dropped — the catalog is the slower-moving authority but the on-disk file set is
    // what we can actually serve. The drop signals catalog/disk drift, so warn once per
    // missing arena id (not per entry).
    Dictionary<int, long> liveSizes = [];
    HashSet<int> missingArenas = [];
    foreach (CatalogEntry entry in entries)
    {
        int aid = entry.Location.ArenaId;
        if (!_arenas.TryGetValue(aid, out ArenaFile? arena))
        {
            if (missingArenas.Add(aid) && _logger.IsWarn)
                _logger.Warn($"Persisted-snapshot catalog references arena {aid} with no on-disk file; dropping its entries.");
            continue;
        }
        long end = entry.Location.Offset + entry.Location.Size;
        if (end > arena.Frontier) arena.Frontier = end;

        liveSizes.TryGetValue(aid, out long live);
        liveSizes[aid] = live + entry.Location.Size;
    }

    // Now that frontiers reflect the catalog's high-water mark, push the per-file count + bytes
    // gauges in one go (seeds ReportedFrontier).
    foreach (KeyValuePair<int, ArenaFile> kv in _arenas)
    {
        liveSizes.TryGetValue(kv.Key, out long live);
        kv.Value.DeadBytes = kv.Value.Frontier - live;
        kv.Value.ReportAdded();
    }
}

/// <summary>
/// Create an <see cref="ArenaWriter"/> for buffered writes. A shared arena is taken out of its
/// mutable pool for the duration of the write; the writer signals back via
/// <see cref="OnWriteCompleted"/> / <see cref="OnWriteCancelledShared"/> /
/// <see cref="OnWriteCancelledDedicated"/>.
/// </summary>
/// <param name="estimatedSize">
/// Expected payload size. At or above the dedicated threshold the write gets its own file of
/// exactly this size; below it the write is packed into a shared arena with enough headroom.
/// </param>
/// <param name="small">Selects the small-arena tier when a new or shared file is chosen.</param>
public ArenaWriter CreateWriter(long estimatedSize, bool small = false)
{
    using Lock.Scope scope = _lock.EnterScope();
    bool dedicated = estimatedSize >= _dedicatedArenaThreshold;
    ArenaFile file = dedicated
        ? CreateArenaFile(estimatedSize, dedicated: true, small: small)
        : GetOrCreateArena(estimatedSize, small);
    long offset = file.Frontier;
    // Open the stream BEFORE reserving the file. If the open throws, no writer exists to hand
    // the id back via Complete / Cancel, so removing it first would silently drop an arena that
    // still has headroom from the pool for good.
    FileStream stream = file.CreateWriteStream(offset);
    // Reserve: remove from the mutable pool so no concurrent CreateWriter picks the same
    // file. OnWriteCompleted / OnWriteCancelledShared re-adds the id if room remains.
    // Dedicated files never enter the mutable pool. Route off file.Small (not the small
    // arg) so the remove always targets the same pool the file was scanned from.
    if (!dedicated) PoolFor(file).Remove(file.Id);
    return new ArenaWriter(this, file, dedicated, offset, stream);
}

// The mutable pool a shared arena belongs to, chosen by its tier.
private HashSet<int> PoolFor(ArenaFile file) => file.Small ? _mutableSmallArenas : _mutableArenas;

/// <summary>
/// Bookkeeping after <see cref="ArenaWriter.Complete"/>. The writer has already set
/// <see cref="ArenaFile.Frontier"/> and (if dedicated) called <see cref="ArenaFile.Truncate"/>;
/// the manager does NOT touch the file here. <paramref name="hasHeadroom"/> is true for
/// shared writes whose post-frontier still leaves room for further packing.
/// </summary>
internal void OnWriteCompleted(ArenaFile file, bool hasHeadroom)
{
    using Lock.Scope scope = _lock.EnterScope();
    if (hasHeadroom) PoolFor(file).Add(file.Id);
    // Ratchet ArenaAllocatedBytes up to file.Frontier (post-write high-water): push the
    // delta since the last report and bring file.ReportedFrontier in sync.
    long delta = file.Frontier - file.ReportedFrontier;
    if (delta != 0)
    {
        file.ReportedFrontier = file.Frontier;
        Interlocked.Add(ref Metrics._arenaAllocatedBytes, delta);
    }
}

///
/// Bookkeeping after a cancelled write on a shared (non-dedicated) arena: return the id
/// to the mutable pool (the writer didn't advance the frontier, so by construction it
/// still has the same headroom it had when picked).
///
internal void OnWriteCancelledShared(ArenaFile file)
{
    using (_lock.EnterScope())
    {
        PoolFor(file).Add(file.Id);
    }
}

/// <summary>
/// Bookkeeping for a cancelled write on a dedicated arena. By the time this runs the writer
/// has already released the manager's lease on the file, which closes and deletes it, so all
/// that is left is to forget the id and take the file out of the gauges.
/// <paramref name="file"/> is still readable after disposal because Id and ReportedFrontier
/// are plain fields.
/// </summary>
internal void OnWriteCancelledDedicated(ArenaFile file)
{
    using (_lock.EnterScope())
    {
        _arenas.TryRemove(file.Id, out _);
        file.ReportRemoved();
    }
}

/// <summary>
/// Returns an <see cref="ArenaReservation"/> over an existing snapshot location for zero-copy
/// reads. The id lookup takes no lock; a file that is concurrently being torn down is detected
/// inside the reservation's constructor, which fails to acquire a lease and throws.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The arena id is unknown to this manager, or the file is already being disposed.
/// </exception>
public ArenaReservation Open(in SnapshotLocation location)
{
    if (_arenas.TryGetValue(location.ArenaId, out ArenaFile? arenaFile))
    {
        if (_logger.IsDebug)
        {
            _logger.Debug($"Reserved arena {location.ArenaId} [{location.Offset}, {location.Offset + location.Size}) ({location.Size} bytes)");
        }

        return new ArenaReservation(this, arenaFile, location.ArenaId, location.Offset, location.Size);
    }

    throw new InvalidOperationException($"Arena {location.ArenaId} is not registered with this manager.");
}

///
/// Mark bytes of as dead and, if the
/// file's dead-byte total has caught up with its frontier, drop the manager's dict ref so
/// the file self-cleans once its last reservation releases its lease. The caller (typically
/// ) already holds the file ref and handles file-side
/// ops (madvise / posix_fadvise) itself — this method's sole job is the atomic
/// set/dict/metric mutation that needs the manager lock.
///
///
/// true if the file survives in the manager; false if this call removed it
/// (all bytes dead) or the manager is disposed.
///
public bool MarkDead(ArenaFile file, long deadSize)
{
    using Lock.Scope scope = _lock.EnterScope();
    // After Dispose, on-disk files must be preserved for the next session — skip
    // dead-byte accounting and file deletion entirely. Reporting "not surviving"
    // also makes ArenaReservation.CleanUp skip the hole punch, so a file the next
    // session rehydrates is never zeroed.
    if (_disposed) return false;
    // Sole caller is ArenaReservation.CleanUp, so one call == one reservation released.
    if (_logger.IsDebug) _logger.Debug($"Released arena reservation on arena {file.Id} ({deadSize} bytes)");
    file.DeadBytes += deadSize;
    if (file.DeadBytes < file.Frontier) return true;
    PoolFor(file).Remove(file.Id);
    if (_arenas.TryRemove(file.Id, out _))
    {
        if (_logger.IsDebug) _logger.Debug($"Released arena file {file.Id} (all {file.Frontier} bytes dead)");
        file.ReportRemoved();
        file.Dispose();
    }
    return false;
}

/// <inheritdoc />
public bool TryPunchHole(ArenaFile file, long offset, long size)
{
    if (!_punchHoleOnReclaim || Volatile.Read(ref _punchHoleSupported) == 0) return false;
    PosixReclaim.PunchHoleOutcome outcome = file.PunchHole(offset, size);
    if (outcome == PosixReclaim.PunchHoleOutcome.Unsupported)
    {
        // First permanent "unsupported" from the kernel — stop trying on every later cleanup.
        Volatile.Write(ref _punchHoleSupported, 0);
    }
    return outcome == PosixReclaim.PunchHoleOutcome.Done;
}

/// <summary>
/// Whether the adaptive punch-hole support flag is still set — i.e. no
/// filesystem-unsupported error has been seen. Independent of the operator config flag.
/// </summary>
internal bool PunchHoleSupported => Volatile.Read(ref _punchHoleSupported) == 1;

// Picks a shared arena of the requested tier that can take <requiredSize> more bytes, or
// creates a fresh one. Must be called with the manager lock held.
private ArenaFile GetOrCreateArena(long requiredSize, bool small)
{
    // Scan the matching mutable pool (none currently held by a writer). Files that can't fit
    // are pruned (they become permanently read-only from the manager's POV).
    HashSet<int> pool = small ? _mutableSmallArenas : _mutableArenas;
    List<int>? toRemove = null;
    ArenaFile? result = null;
    foreach (int id in pool)
    {
        // TryGetValue instead of the indexer: a pool id whose file is no longer registered
        // (Dispose clears _arenas but leaves the pools alone) used to throw
        // KeyNotFoundException here and fail every later CreateWriter for this tier. Such
        // ids are pruned like any other unusable entry.
        if (_arenas.TryGetValue(id, out ArenaFile? candidate)
            && candidate.Frontier + requiredSize <= candidate.MappedSize)
        {
            result = candidate;
            break;
        }

        (toRemove ??= []).Add(id);
    }

    if (toRemove is not null)
    {
        foreach (int id in toRemove)
            pool.Remove(id);
    }

    return result ?? CreateArenaFile(small: small);
}

// Creates and registers a new arena file. mappedSize == 0 means "use the configured arena
// size". The file name prefix encodes the kind: dedicated wins over small, small over regular.
private ArenaFile CreateArenaFile(long mappedSize = 0, bool dedicated = false, bool small = false)
{
    if (mappedSize == 0) mappedSize = _maxArenaSize;
    int id = _nextArenaId++;
    string prefix = dedicated ? DedicatedArenaFilePrefix : small ? SmallArenaFilePrefix : ArenaFilePrefix;
    string path = Path.Combine(_basePath, $"{prefix}{id:D4}{ArenaFileExtension}");
    ArenaFile arena = new(id, path, mappedSize, small);
    _arenas[id] = arena;
    if (_logger.IsDebug) _logger.Debug($"Created arena file {path} (mapped {mappedSize} bytes{(dedicated ? ", dedicated" : "")}{(small ? ", small" : "")})");
    // Fresh shared file isn't added to _mutableArenas — the writer that just took it
    // is its "owner". The writer's Complete / Cancel adds it (if room remains).
    arena.ReportAdded();
    return arena;
}

// Extracts the numeric id from "<prefix><digits>.bin". Returns -1 when the name does not start
// with the prefix or the remainder is not a plain run of digits.
private static int ParseArenaId(string filePath, string prefix)
{
    string fileName = Path.GetFileNameWithoutExtension(filePath);
    if (!fileName.StartsWith(prefix, StringComparison.Ordinal)) return -1;
    return int.TryParse(fileName.AsSpan(prefix.Length), NumberStyles.None, CultureInfo.InvariantCulture, out int id) ? id : -1;
}

public void Dispose()
{
    // Idempotent — owners higher up may also Dispose us through their own teardown.
+ using Lock.Scope scope = _lock.EnterScope(); + if (_disposed) return; + _disposed = true; + foreach (KeyValuePair kv in _arenas) + { + kv.Value.ReportRemoved(); + kv.Value.Dispose(); + } + _arenas.Clear(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaReservation.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaReservation.cs new file mode 100644 index 000000000000..ec802b96e717 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaReservation.cs @@ -0,0 +1,174 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Utils; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// A reservation of space within an arena. Owns a lease on its and +/// coordinates lifecycle (page-cache reclaim, punch-hole) with the owning +/// on disposal. +/// +public sealed class ArenaReservation : SmallRefCountingDisposable +{ + private readonly IArenaManager _arenaManager; + // The owning file. Held directly so read-path operations skip the manager's id → + // ArenaFile dictionary lookup. + private readonly ArenaFile _arenaFile; + private readonly long _initialSize; + + private int ArenaId { get; } + internal long Offset { get; } + public long Size { get; internal set; } + // Set once via PersistOnShutdown; checked in CleanUp to skip the punch-hole reclaim + // so a snapshot the next session needs to rehydrate is not zeroed on disk. Independent + // of the file-level _preserveOnDispose: a shared arena may still hold other live + // reservations, so the file stays alive regardless — only the punch over THIS + // reservation's range needs to be suppressed. + private int _preserveOnDispose; + + /// + /// On-disk byte footprint of this reservation, page-padded up to where the next + /// reservation begins. 
/// For a shared arena is OS-page-aligned and
/// the next reservation starts at Offset + Footprint, so reclamation syscalls
/// (madvise / posix_fadvise / fallocate(PUNCH_HOLE)) over
/// [Offset, Offset + Footprint) cover whole pages exactly without touching a
/// neighbour. Capped at the file so a truncated dedicated arena reduces to .
///
private long Footprint
{
    get
    {
        long pagePadded = PageLayout.RoundUpToOsPage(Size);
        long bytesToEndOfFile = _arenaFile.MappedSize - Offset;
        return Math.Min(pagePadded, bytesToEndOfFile);
    }
}

/// <summary>
/// Takes a lease on <paramref name="arenaFile"/> and records the reserved range
/// [<paramref name="offset"/>, <paramref name="offset"/> + <paramref name="size"/>), then bumps
/// the reservation count / bytes gauges.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The file is already being disposed, so no lease could be acquired.
/// </exception>
public ArenaReservation(IArenaManager arenaManager, ArenaFile arenaFile,
    int arenaId, long offset, long size)
    : base(1)
{
    // The lease keeps the arena file alive for as long as this reservation exists. The manager
    // may have dropped the file between the caller's lookup and this point; in that case the
    // lease is refused and we fail loudly rather than hand out a reservation on a doomed file.
    bool leased = arenaFile.TryAcquireLease();
    if (!leased)
    {
        throw new InvalidOperationException(
            $"Cannot construct ArenaReservation for arena {arenaId}: the underlying ArenaFile is already being disposed.");
    }

    _arenaFile = arenaFile;
    _arenaManager = arenaManager;
    ArenaId = arenaId;
    Offset = offset;
    _initialSize = size;
    Size = size;

    Interlocked.Increment(ref Metrics._arenaReservationCount);
    Interlocked.Add(ref Metrics._arenaReservationBytes, size);
}

///
/// Pre-fault the OS pages overlapping the reader-relative byte range
/// [localOffset, localOffset + length): when the range spans more than one OS page,
/// issue a single madvise(MADV_POPULATE_READ) over its page-aligned envelope.
///
///
/// Coalesces the per-page pre-fault syscalls into one for a contiguous read.
/// MADV_POPULATE_READ is a no-op on already-resident pages, so over-faulting the few
/// hot pages inside the range is harmless. A single-page range is skipped — a one-page syscall
/// is not amortized vs. the inline minor fault the reader would otherwise take.
///
internal void TouchRangePopulate(long localOffset, long length)
{
    if (length <= 0) return;
    long pageSize = Environment.SystemPageSize;
    long pageMask = ~(pageSize - 1);
    long absStart = Offset + localOffset;
    long absEnd = absStart + length;
    long firstPageBase = absStart & pageMask;
    long lastPageBaseExclusive = (absEnd + pageSize - 1) & pageMask;

    // The envelope is a whole number of pages, so "spans more than one page" is simply
    // "longer than one page". Comparing the long byte span avoids narrowing page indices to
    // int, which silently truncates for very large arena offsets.
    long envelope = lastPageBaseExclusive - firstPageBase;
    if (envelope > pageSize)
        _arenaFile.PopulateRead(firstPageBase, envelope);
}

/// <summary>
/// Begin a scoped whole-buffer read. The session is constructed over this reservation;
/// per the original note, disposing it releases its lease and (by default) issues
/// madvise(MADV_DONTNEED) on the mapped range. Pass
/// <paramref name="adviseDontNeedOnDispose"/> = false when the caller has
/// arranged an explicit eviction elsewhere and a redundant madvise on session close
/// would be wasteful.
/// </summary>
public WholeReadSession BeginWholeReadSession(bool adviseDontNeedOnDispose = true) =>
    new(this, adviseDontNeedOnDispose);

// Opens a dedicated mmap view over exactly this reservation's [Offset, Offset + Size) range.
internal ArenaFile.MmapWholeView OpenWholeView(bool adviseDontNeedOnDispose) =>
    _arenaFile.OpenWholeView(Offset, Size, adviseDontNeedOnDispose);

/// <summary>
/// Construct an <see cref="ArenaByteReader"/> over this reservation's bytes. The reader
/// pre-faults the OS pages it reads via <see cref="TouchRangePopulate"/>. Pointer-backed so
/// &gt;2 GiB reservations are addressable.
/// </summary>
public unsafe ArenaByteReader CreateReader() =>
    new(_arenaFile.BasePtr + Offset, Size, this);

/// <summary>
/// madvise(MADV_DONTNEED) over the reservation's range, dropping the mmap working set
/// without freeing disk blocks. The owning snapshot stays alive and readable; a later read
/// re-faults any dropped page.
/// </summary>
public void AdviseDontNeed() => _arenaFile.AdviseDontNeed(Offset, Footprint);

///
/// Demote variant of : madvise(MADV_DONTNEED) plus
/// posix_fadvise(POSIX_FADV_DONTNEED) over the reservation's range.
/// Drops both the mmap
/// working set and the OS file-cache pages without freeing disk blocks — unlike
/// it must not punch a hole, because the owning snapshot stays alive and
/// readable.
///
public void AdviseAndFadviseDontNeed()
{
    long start = Offset;
    long bytes = Footprint;
    _arenaFile.AdviseDontNeed(start, bytes);
    _arenaFile.FadviseDontNeed(start, bytes);
}

/// <summary>
/// Flushes the backing <see cref="ArenaFile"/> to disk via fsync(2). The convert/compact
/// paths call this once the writer has completed, so the new bytes are durable before the
/// catalog starts referring to this reservation.
/// </summary>
public void Fsync()
{
    _arenaFile.Fsync();
}

/// <summary>
/// Flags both this reservation and its <see cref="ArenaFile"/> to survive shutdown.
/// The reservation flag makes <see cref="CleanUp"/> skip the punch-hole over this range; the
/// forwarded file flag makes the file's own cleanup keep the file on disk instead of
/// deleting it.
/// </summary>
public void PersistOnShutdown()
{
    _ = Interlocked.Exchange(ref _preserveOnDispose, 1);
    _arenaFile.PersistOnShutdown();
}

protected override void CleanUp()
{
    // File-side ops on the ref we already hold — no manager dict lookup. MarkDead does
    // the atomic set/dict/metric bookkeeping; the page-padded Footprint keeps its
    // DeadBytes >= Frontier accounting exact for shared arenas.
    long footprint = Footprint;
    _arenaFile.AdviseDontNeed(Offset, footprint);
    bool fileSurvives = _arenaManager.MarkDead(_arenaFile, footprint);
    // A reservation flagged PersistOnShutdown must not be punched even when the file
    // survives — the next session needs to mmap this exact range. A file MarkDead removed
    // is about to be File.Delete'd — punching it is wasted work. A successful punch-hole
    // already invalidates the page cache, so the follow-up fadvise is then redundant and
    // skipped.
+ bool preserve = Volatile.Read(ref _preserveOnDispose) == 1; + bool punched = !preserve && fileSurvives && _arenaManager.TryPunchHole(_arenaFile, Offset, footprint); + // Skip the fadvise when the file did not survive — it is about to be deleted on the last lease + // release below, which drops its pages anyway. + if (!punched && fileSurvives) + _arenaFile.FadviseDontNeed(Offset, footprint); + Interlocked.Decrement(ref Metrics._arenaReservationCount); + Interlocked.Add(ref Metrics._arenaReservationBytes, -_initialSize); + _arenaFile.Dispose(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaWriter.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaWriter.cs new file mode 100644 index 000000000000..a4487da19e26 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ArenaWriter.cs @@ -0,0 +1,81 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Buffered writer over an arena slice. Complete and Cancel mutate the +/// (truncate / drop manager-lease) and then notify for dict / metric +/// bookkeeping. 
/// <remarks>
/// Lifecycle: a writer ends in exactly one of two ways — <see cref="Complete"/> (the write is
/// published) or a cancelling <see cref="Dispose"/> (the write is abandoned).
/// <see cref="Dispose"/> is idempotent and <see cref="Complete"/> rejects a second call or a
/// call after disposal, so the manager is never notified twice for the same write.
/// </remarks>
public sealed class ArenaWriter : IDisposable
{
    // Mutable struct handed out by reference via GetWriter — must not be readonly.
    private ArenaBufferWriter _writer;
    private readonly ArenaManager _manager;
    private readonly ArenaFile _file;
    private readonly bool _dedicated;
    private readonly long _startOffset;
    private bool _completed;
    private bool _disposed;

    internal ArenaWriter(ArenaManager manager, ArenaFile file, bool dedicated, long startOffset, Stream stream)
    {
        _manager = manager;
        _file = file;
        _dedicated = dedicated;
        _startOffset = startOffset;
        // Two's-complement trick: (-x) & mask is the distance from x up to the next multiple
        // of (mask + 1), or 0 when x is already aligned.
        // NOTE(review): assumes PageLayout.PageMask == PageSize - 1 — confirm.
        long firstOffset = (-startOffset) & PageLayout.PageMask;
        _writer = new ArenaBufferWriter(stream, firstOffset);
    }

    /// <summary>Reference to the underlying buffer writer; callers write the slice through it.</summary>
    public ref ArenaBufferWriter GetWriter() => ref _writer;

    /// <summary>
    /// Flush the buffered bytes, advance the file's frontier, trim a dedicated file down to the
    /// bytes actually written, and hand back the location plus a reservation over the slice.
    /// </summary>
    /// <returns>The slice's location and a reservation over it.</returns>
    /// <exception cref="ObjectDisposedException">The writer has already been disposed.</exception>
    /// <exception cref="InvalidOperationException">The writer has already been completed.</exception>
    public (SnapshotLocation Location, ArenaReservation Reservation) Complete()
    {
        if (_disposed)
            throw new ObjectDisposedException(nameof(ArenaWriter));
        // A second Complete would mint a second reservation over the same range and notify
        // the manager twice.
        if (_completed)
            throw new InvalidOperationException("ArenaWriter already completed.");

        _writer.Flush();
        _completed = true;
        long actualSize = _writer.Written;
        long dataEnd = _startOffset + actualSize;
        // Shared arenas pack many reservations per file. Pad the frontier up to an OS-page
        // boundary so the next reservation starts page-aligned and reclamation syscalls
        // (fadvise / fallocate punch-hole) over a reservation cover whole pages exactly.
        long newFrontier = _dedicated
            ? dataEnd
            : Math.Min(PageLayout.RoundUpToOsPage(dataEnd), _file.MappedSize);
        _file.Frontier = newFrontier;

        if (_dedicated && newFrontier > 0 && newFrontier < _file.MappedSize)
        {
            // Dedicated arenas are pre-sized to the writer's estimate; trim the file down
            // to the actual frontier so the on-disk length and mmap footprint match what
            // was written. Dedicated files reach this path before any reservation is
            // constructed against them, so it's safe to shrink the mapping in place.
            _file.Truncate(newFrontier);
        }

        SnapshotLocation location = new(_file.Id, _startOffset, actualSize);
        ArenaReservation reservation = new(_manager, _file, _file.Id, _startOffset, actualSize);
        // Dedicated arenas are one-shot — they never return to the mutable pool. Shared
        // arenas re-enter the pool iff there's still room for the next packing scan.
        bool hasHeadroom = !_dedicated && newFrontier < _file.MappedSize;
        _manager.OnWriteCompleted(_file, hasHeadroom);
        return (location, reservation);
    }

    /// <summary>
    /// Release the buffer writer. When <see cref="Complete"/> never ran, this also cancels the
    /// write: a dedicated file drops the manager's lease, a shared file is handed back to the
    /// manager. Safe to call more than once — only the first call has any effect.
    /// </summary>
    public void Dispose()
    {
        // Idempotency guard: a repeated Dispose must not release the dedicated file's lease
        // again or notify the manager a second time.
        if (_disposed) return;
        _disposed = true;

        _writer.Dispose();
        if (_completed) return;
        if (_dedicated)
        {
            // Drop the manager's count=1 lease — the file's CleanUp closes mmap + handle and
            // deletes it on disk. Then notify the manager to clear its dict / metric state; the
            // file ref stays readable post-dispose (Id / ReportedFrontier are plain fields).
            _file.Dispose();
            _manager.OnWriteCancelledDedicated(_file);
        }
        else
        {
            _manager.OnWriteCancelledShared(_file);
        }
    }
}
/// <see cref="BlobArenaManager.OnWriteCompleted"/> pushes frontier deltas as writes advance.
/// Bytes are reported as allocated (<see cref="Frontier"/>-based), never as the
/// <see cref="MaxSize"/> cap — the constructor does not pre-extend the file.
public sealed class BlobArenaFile : RefCountingDisposable
{
    // Treated as bool; 0 = delete on CleanUp, 1 = keep the on-disk file. Set by
    // PersistOnShutdown via Interlocked.Exchange so it is safe to call from any path.
    private int _preserveOnDispose;

    /// <summary>Stable file id, narrowed from int to ushort.</summary>
    public ushort BlobArenaId { get; }

    /// <summary>On-disk path. Deleted by <see cref="CleanUp"/> unless <see cref="PersistOnShutdown"/> was called.</summary>
    private string Path { get; }

    /// <summary>
    /// Upper bound on the file's data extent; writers append within this cap. The file is not
    /// pre-extended to this size — its length grows as data is written.
    /// </summary>
    public long MaxSize { get; }

    private SafeFileHandle Handle { get; }

    // Raw descriptor passed to the POSIX helpers below.
    private int Fd => (int)Handle.DangerousGetHandle();

    /// <summary>Next-write offset. Mutated under the manager's lock during writer registration.</summary>
    internal long Frontier { get; set; }

    /// <summary>
    /// Last value of <see cref="Frontier"/> reported to <c>Metrics.BlobAllocatedBytes</c>, so the
    /// manager can push frontier deltas without re-counting bytes it already reported.
    /// </summary>
    internal long ReportedFrontier { get; set; }

    internal BlobArenaFile(ushort id, string path, long maxSize, long frontier)
    {
        BlobArenaId = id;
        Path = path;
        MaxSize = maxSize;
        Handle = File.OpenHandle(path, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.ReadWrite);
        // File length tracks actual data extent — FileStream.Write auto-extends on demand,
        // so we skip the pre-extension ftruncate. Keeping length == Frontier makes
        // BlobArenaManager.Initialize's frontier restore accurate (no sparse-tail surprise)
        // and lets restored files re-enter the packing pool when they still have headroom.
        Frontier = frontier;
        ReportedFrontier = frontier;
        Interlocked.Increment(ref Metrics._blobFileCount);
        if (frontier > 0)
            Interlocked.Add(ref Metrics._blobAllocatedBytes, frontier);
    }

    /// <summary>
    /// Mark this file as "preserve on disk when its refcount hits zero", so it survives manager
    /// teardown and can be rehydrated by the next session's <see cref="BlobArenaManager.Initialize"/>.
    /// Idempotent.
    /// </summary>
    public void PersistOnShutdown() => Interlocked.Exchange(ref _preserveOnDispose, 1);

    /// <summary>
    /// True iff <see cref="PersistOnShutdown"/> has been called for this file. Read by
    /// <see cref="BlobArenaManager.TryResetOrphanedFrontier"/> so an orphan-frontier reset does
    /// not wipe a file the caller has promised to preserve across the next session.
    /// </summary>
    internal bool IsShutdownPreserved => Volatile.Read(ref _preserveOnDispose) != 0;

    /// <summary>
    /// Defensive lease acquisition; returns false when the file has already entered
    /// <see cref="CleanUp"/>. Re-exposes the base method as internal so the owning manager can
    /// lease under its lock.
    /// </summary>
    internal new bool TryAcquireLease() => base.TryAcquireLease();

    /// <summary>
    /// True iff the file's refcount is exactly 1 — i.e. the only outstanding lease is the
    /// manager's array slot.
    /// </summary>
    internal bool HasOnlyManagerLease => Volatile.Read(ref _leases.Value) == 1;

    /// <summary>
    /// Read into <paramref name="destination"/> starting at file offset <paramref name="offset"/>,
    /// looping until the span is full or the file yields no more bytes.
    /// </summary>
    /// <returns>Total bytes copied; less than <c>destination.Length</c> on a short read at EOF.</returns>
    public int RandomRead(long offset, Span<byte> destination)
    {
        int total = 0;
        while (total < destination.Length)
        {
            int read = RandomAccess.Read(Handle, destination[total..], offset + total);
            if (read <= 0) break;
            total += read;
        }
        return total;
    }

    /// <summary>
    /// Open a write stream positioned at <paramref name="startOffset"/>. Caller disposes it
    /// when done. If positioning fails the stream is closed before the exception propagates.
    /// </summary>
    internal FileStream OpenWriteStream(long startOffset)
    {
        FileStream fs = new(Path, FileMode.Open, FileAccess.Write, FileShare.ReadWrite, bufferSize: 1);
        try
        {
            fs.Seek(startOffset, SeekOrigin.Begin);
            return fs;
        }
        catch
        {
            // Nobody else holds the stream yet — close it here or its handle leaks.
            fs.Dispose();
            throw;
        }
    }

    /// <summary>
    /// <c>posix_fadvise(POSIX_FADV_DONTNEED)</c> over <c>[offset, offset + size)</c>, dropping
    /// the range from the OS file cache.
    /// </summary>
    internal void FadviseDontNeed(long offset, long size) =>
        PosixReclaim.FadviseDontNeed(Fd, offset, size);

    /// <summary>
    /// <c>posix_fadvise(POSIX_FADV_WILLNEED)</c> over <c>[offset, offset + size)</c>, asking the
    /// kernel to begin asynchronous read-ahead of the range.
    /// </summary>
    internal void FadviseWillNeed(long offset, long size) =>
        PosixReclaim.FadviseWillNeed(Fd, offset, size);

    /// <summary>
    /// <c>fsync(2)</c> the underlying file — block until all previously written bytes are
    /// durable on disk.
    /// </summary>
    internal void Fsync() => PosixReclaim.Fsync(Fd);

    /// <summary>
    /// Set the underlying file's length to <paramref name="newSize"/>. Used by
    /// <see cref="BlobArenaManager.TryResetOrphanedFrontier"/> with 0 to reclaim an orphaned file.
    /// </summary>
    internal void SetFileLength(long newSize) =>
        RandomAccess.SetLength(Handle, newSize);

    /// <summary>
    /// Runs when the last lease is released: closes the handle, deletes the on-disk file unless
    /// <see cref="PersistOnShutdown"/> was called, and takes this file's contribution back out
    /// of the blob file-count / allocated-bytes metrics.
    /// </summary>
    protected override void CleanUp()
    {
        Handle.Dispose();
        if (Volatile.Read(ref _preserveOnDispose) == 0)
        {
            // Deliberately best-effort: a failed delete must not throw out of CleanUp.
            try { File.Delete(Path); } catch { /* best-effort */ }
        }
        Interlocked.Decrement(ref Metrics._blobFileCount);
        long reported = ReportedFrontier;
        ReportedFrontier = 0;
        if (reported > 0)
            Interlocked.Add(ref Metrics._blobAllocatedBytes, -reported);
    }
}
/// <para>
/// Wiring convention: <c>FlatWorldStateModule</c> instantiates exactly one
/// (<c>ArenaManager</c> metadata, <c>BlobArenaManager</c> blobs) pair, shared by the
/// persisted-snapshot repository and the compactor.
/// </para>
/// <para>
/// One id per file. A blob arena id is the file's stable numeric id (narrowed to
/// <see cref="ushort"/>) — many writers across many base snapshots append into the same file
/// over its lifetime; a writer reserves the file by removing it from <c>_mutableFiles</c> and
/// releases it (re-adding) on Complete or Cancel. A new id is only minted when no pooled file
/// has headroom.
/// </para>
/// <para>
/// Storage: a flat <c>BlobArenaFile?[ushort.MaxValue + 1]</c> array indexed by id. O(1) lookup,
/// no hash, no concurrent-dictionary overhead. Memory footprint: 65 536 × 8 B ≈ 512 KiB per
/// manager.
/// </para>
public sealed class BlobArenaManager : IDisposable
{
    private const string BlobFilePrefix = "blob_";
    private const string BlobFileExtension = ".bin";

    private readonly string _basePath;
    private readonly long _maxFileSize;
    private readonly Lock _lock = new();
    // Indexed by blob arena id. Null slot = no file. Reads (TryLeaseFile lookup) are
    // unlocked — reference-slot reads are atomic in the CLR memory model. Slot mutations
    // (insert / null) happen under _lock alongside _mutableFiles.
    private readonly BlobArenaFile?[] _files = new BlobArenaFile?[ushort.MaxValue + 1];
    // Files that still have headroom for further packing AND are not currently held by
    // a writer. A writer reserves a file by removing it from this set; Complete / Cancel
    // re-add it (if room remains). Protected by _lock.
    private readonly HashSet<ushort> _mutableFiles = [];
    private int _nextFileId;
    private bool _disposed;

    public BlobArenaManager(string basePath, long maxFileSize)
    {
        _basePath = basePath;
        _maxFileSize = maxFileSize;
        Directory.CreateDirectory(basePath);
    }

    /// <summary>
    /// Rehydrate the file pool from on-disk file lengths: every <c>blob_*.bin</c> file with a
    /// valid id is registered with its length as frontier. Must be called before any persisted
    /// snapshot that references blob ids is loaded, so those ids can be resolved.
    /// </summary>
    public void Initialize()
    {
        using Lock.Scope scope = _lock.EnterScope();
        foreach (string path in Directory.GetFiles(_basePath, $"*{BlobFileExtension}"))
        {
            string name = Path.GetFileName(path);
            if (!name.StartsWith(BlobFilePrefix, StringComparison.Ordinal)) continue;
            int id = ParseId(name);
            if (id < 0 || id > ushort.MaxValue) continue;
            // Two names can parse to the same id (e.g. "blob_1.bin" and "blob_0001.bin").
            // Keep the slot's current occupant: overwriting it would orphan an open handle
            // and leave its file-count / allocated-bytes metric contribution behind forever.
            if (_files[id] is not null) continue;
            long len = new FileInfo(path).Length;
            // A restored file already longer than the configured cap keeps its length as cap.
            long maxSize = Math.Max(len, _maxFileSize);
            BlobArenaFile file = new((ushort)id, path, maxSize, frontier: len);
            _files[id] = file;
            _nextFileId = Math.Max(_nextFileId, id + 1);
            if (len < _maxFileSize) _mutableFiles.Add((ushort)id);
        }
    }

    /// <summary>
    /// Open a writer that appends into an existing arena file with headroom for
    /// <paramref name="estimatedSize"/> (or into a fresh file if none qualifies). The writer
    /// holds a lease on the underlying <see cref="BlobArenaFile"/> for its lifetime;
    /// <see cref="BlobArenaWriter.Dispose"/> drops it.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The manager has been disposed.</exception>
    /// <exception cref="InvalidOperationException">The id space is exhausted, or the chosen file is mid-cleanup.</exception>
    public BlobArenaWriter CreateWriter(long estimatedSize)
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (_disposed)
            throw new ObjectDisposedException(nameof(BlobArenaManager));

        ushort? chosen = null;
        List<ushort>? toRemove = null;
        foreach (ushort id in _mutableFiles)
        {
            BlobArenaFile candidate = _files[id]!;
            if (candidate.Frontier + estimatedSize <= candidate.MaxSize)
            {
                chosen = id;
                break;
            }
            // Candidates visited before a match that cannot fit this estimate leave the pool.
            // NOTE(review): they are not re-added here even if a smaller write would fit — confirm intended.
            (toRemove ??= []).Add(id);
        }
        // Removal is deferred: the set cannot be mutated while it is being enumerated.
        if (toRemove is not null)
            foreach (ushort id in toRemove) _mutableFiles.Remove(id);

        ushort fileId;
        BlobArenaFile file;
        long startOffset;
        if (chosen is ushort existing)
        {
            fileId = existing;
            file = _files[fileId]!;
            startOffset = file.Frontier;
            // Reserve: remove from the mutable set so no concurrent CreateWriter picks it.
            // OnWriteCompleted / OnWriteCancelled re-add it if it still has headroom.
            _mutableFiles.Remove(fileId);
        }
        else
        {
            if (_nextFileId > ushort.MaxValue)
                throw new InvalidOperationException(
                    $"Blob arena file id space exhausted ({ushort.MaxValue + 1} files).");
            fileId = (ushort)_nextFileId++;
            string path = Path.Combine(_basePath, $"{BlobFilePrefix}{fileId:D4}{BlobFileExtension}");
            file = new BlobArenaFile(fileId, path, _maxFileSize, frontier: 0);
            _files[fileId] = file;
            // Fresh file isn't added to _mutableFiles yet — Complete/Cancel adds it.
            startOffset = 0;
        }

        // The writer's lease keeps the file alive for the write. Mid-cleanup shouldn't happen
        // under _lock, but guard against it.
        if (!file.TryAcquireLease())
            throw new InvalidOperationException(
                $"Blob arena {fileId} is mid-cleanup; cannot open writer.");

        FileStream? stream = null;
        try
        {
            stream = file.OpenWriteStream(startOffset);
            return new BlobArenaWriter(this, file, startOffset, stream);
        }
        catch
        {
            // No writer exists to run the cancel path, so undo the reservation here: close
            // the stream, drop the lease taken above and hand the file back to the packing
            // pool (its frontier did not move) — the same end state as OnWriteCancelled.
            stream?.Dispose();
            file.Dispose();
            _mutableFiles.Add(fileId);
            throw;
        }
    }

    /// <summary>
    /// Acquire a lease on the file identified by <paramref name="blobArenaId"/>. Returns false
    /// if the manager doesn't know the id, or if the file is mid-cleanup. The caller drops the
    /// lease by disposing the returned file.
    /// </summary>
    public bool TryLeaseFile(ushort blobArenaId, [NotNullWhen(true)] out BlobArenaFile? file)
    {
        // Lock-free: reference-slot reads are atomic and TryAcquireLease guards the race
        // where the file is mid-CleanUp (see the comment on _files). SweepUnreferenced/Dispose
        // either land before our read (slot is null) or after our lease (HasOnlyManagerLease
        // sees the extra lease and skips).
        BlobArenaFile? candidate = _files[blobArenaId];
        if (candidate is null || !candidate.TryAcquireLease())
        {
            file = null;
            return false;
        }
        file = candidate;
        return true;
    }

    /// <summary>
    /// Return the blob arena file currently registered under <paramref name="blobArenaId"/>.
    /// Lock-free O(1) array read that does NOT bump the refcount — the caller must already
    /// hold a lease on the file (typically from <see cref="TryLeaseFile"/>).
    /// </summary>
    /// <exception cref="InvalidOperationException">No file is registered under the id.</exception>
    public BlobArenaFile GetFile(ushort blobArenaId) =>
        _files[blobArenaId]
            ?? throw new InvalidOperationException(
                $"Blob arena {blobArenaId} not registered with this manager.");

    /// <summary>
    /// Called by <see cref="BlobArenaWriter.Complete"/> after the writer has set the file's new
    /// frontier directly. Re-pools the file when it still has headroom and pushes the
    /// post-write frontier delta to <c>Metrics.BlobAllocatedBytes</c>.
    /// </summary>
    internal void OnWriteCompleted(BlobArenaFile file, bool hasHeadroom)
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (hasHeadroom) _mutableFiles.Add(file.BlobArenaId);
        // Ratchet BlobAllocatedBytes up to file.Frontier: push the delta since the last report
        // and bring ReportedFrontier in sync. Bytes are allocated (Frontier), not the MaxSize cap.
        long delta = file.Frontier - file.ReportedFrontier;
        if (delta != 0)
        {
            file.ReportedFrontier = file.Frontier;
            Interlocked.Add(ref Metrics._blobAllocatedBytes, delta);
        }
    }

    /// <summary>
    /// Called by <see cref="BlobArenaWriter.Dispose"/> on the cancel path. The writer's frontier
    /// didn't advance, so the file still has room by construction — re-add the id to the
    /// mutable pool. No file touch.
    /// </summary>
    internal void OnWriteCancelled(ushort blobArenaId)
    {
        using Lock.Scope scope = _lock.EnterScope();
        _mutableFiles.Add(blobArenaId);
    }

    /// <summary>
    /// Delete arena files that no snapshot referenced after a restart — orphans from a
    /// mid-write crash where Complete never ran, or whose owning snapshot was wiped before
    /// restart. Intended to run once catalog loading has finished, when no concurrent activity
    /// is expected.
    /// </summary>
    public void SweepUnreferenced()
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (_disposed) return;
        for (int id = 0; id < _files.Length; id++)
        {
            BlobArenaFile? file = _files[id];
            if (file is null) continue;
            // File still has external lease(s) — a snapshot leased it while loading.
            if (!file.HasOnlyManagerLease) continue;
            _files[id] = null;
            _mutableFiles.Remove((ushort)id);
            // Drop the manager's array-slot lease. With no other lease holders the
            // file's refcount hits zero, CleanUp runs and deletes the on-disk file
            // (preserve flag isn't set — nothing called PersistOnShutdown on this).
            file.Dispose();
        }
    }

    /// <summary>
    /// Called after a snapshot has released its lease on a blob file. If only the manager's
    /// slot lease remains and the file's frontier is non-zero, truncate the file and reset the
    /// frontier to 0 so the bytes gauge drops and the file is reusable for packing from
    /// offset 0. No-op when the file still has external lessees or is flagged for
    /// shutdown preservation.
    /// </summary>
    public void TryResetOrphanedFrontier(BlobArenaFile file)
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (_disposed) return;
        // Slot may already have been replaced (Dispose nulls it out).
        if (_files[file.BlobArenaId] != file) return;
        // Re-check inside the lock — a racing TryLeaseFile or CreateWriter could
        // have bumped the refcount in the window between the caller's
        // HasOnlyManagerLease probe and us taking the lock.
        if (!file.HasOnlyManagerLease) return;
        // A file flagged PersistOnShutdown must be rehydrated intact by the next session.
        // The last snapshot's cleanup arrives here with HasOnlyManagerLease=true — without
        // this guard the truncate below would wipe the whole [0, prev) range of a file
        // that BlobArenaFile.CleanUp then keeps on disk.
        if (file.IsShutdownPreserved) return;
        long prev = file.ReportedFrontier;
        if (prev == 0)
        {
            _mutableFiles.Add(file.BlobArenaId);
            return;
        }

        // Take the file out of the packing pool before mutating Frontier, preserving the
        // "files in _mutableFiles have a stable Frontier" invariant. Re-added at frontier=0 below.
        _mutableFiles.Remove(file.BlobArenaId);

        // Reclaim [0, prev) while still under _lock — a racing CreateWriter would otherwise
        // lease this file and append at offset 0, and a truncate over fresh data would corrupt
        // it. Truncating to 0 zeros the logical length and frees the disk blocks in one call.
        file.SetFileLength(0);

        file.Frontier = 0;
        file.ReportedFrontier = 0;
        Interlocked.Add(ref Metrics._blobAllocatedBytes, -prev);

        _mutableFiles.Add(file.BlobArenaId);
    }

    /// <summary>
    /// Drop the manager's array-slot lease on every registered file. Idempotent.
    /// </summary>
    public void Dispose()
    {
        using Lock.Scope scope = _lock.EnterScope();
        if (_disposed) return;
        _disposed = true;
        for (int id = 0; id < _files.Length; id++)
        {
            BlobArenaFile? file = _files[id];
            if (file is null) continue;
            _files[id] = null;
            // Drop the manager's array-slot lease. If a snapshot still holds a lease,
            // the file's refcount stays positive; the snapshot's later Dispose triggers
            // CleanUp, which honours the PersistOnShutdown flag.
            file.Dispose();
        }
    }

    // Extracts the numeric id from "blob_<digits>.bin"; -1 when the name doesn't match.
    // NumberStyles.None accepts digits only (no sign, whitespace or separators).
    private static int ParseId(string fileName)
    {
        string noExt = Path.GetFileNameWithoutExtension(fileName);
        if (!noExt.StartsWith(BlobFilePrefix, StringComparison.Ordinal)) return -1;
        return int.TryParse(noExt.AsSpan(BlobFilePrefix.Length), NumberStyles.None,
            CultureInfo.InvariantCulture, out int id) ? id : -1;
    }
}
/// By construction <see cref="BlobArenaManager.CreateWriter"/> only hands out a writer whose
/// target file has headroom for the estimated size, so the per-file ceiling throw in
/// <see cref="WriteRlp"/> is a defensive guard against an unusually large RLP late in the
/// writer's life.
public sealed class BlobArenaWriter : IDisposable
{
    private const int BufferSize = 1024 * 1024;

    private readonly BlobArenaManager _manager;
    private readonly BlobArenaFile _file;
    private readonly ushort _blobArenaId;
    private readonly long _startOffset;
    private readonly FileStream _stream;
    // Held at Count == Capacity so AsSpan() exposes the whole 1 MiB buffer; the writer slices
    // the free tail with its own _buffered cursor (same shape as ArenaBufferWriter).
    private readonly NativeMemoryList<byte> _buffer = new(BufferSize, BufferSize);
    private int _buffered;
    // File-absolute offset of the next byte to write. Starts at _startOffset (the file's
    // frontier when this writer was opened) and advances with each write and any inserted
    // pad bytes. The 2 GiB cap is per file: a writer that starts at frontier F can only
    // write up to int.MaxValue - F more bytes.
    private long _written;
    private bool _completed;
    private bool _disposed;

    /// <summary>
    /// The writer takes over a lease on <paramref name="file"/> that
    /// <see cref="BlobArenaManager.CreateWriter"/> acquired for it. <see cref="Dispose"/>
    /// drops that lease; afterwards the manager's array-slot lease keeps the file alive
    /// until manager shutdown or sweep.
    /// </summary>
    internal BlobArenaWriter(BlobArenaManager manager, BlobArenaFile file, long startOffset, FileStream stream)
    {
        _manager = manager;
        _file = file;
        _blobArenaId = file.BlobArenaId;
        _startOffset = startOffset;
        _written = startOffset;
        _stream = stream;
    }

    /// <summary>The blob arena file id embedded in every <see cref="NodeRef"/> returned by <see cref="WriteRlp"/>.</summary>
    public ushort BlobArenaId => _blobArenaId;

    /// <summary>File-absolute offset of the next byte this writer will append (post-padding).</summary>
    public long Written => _written;

    /// <summary>
    /// File-absolute offset of the first byte this writer appends — the start of the
    /// contiguous RLP region it produces. Equals the file's frontier when the writer opened.
    /// </summary>
    public long StartOffset => _startOffset;

    /// <summary>
    /// Append <paramref name="rlp"/> to the blob arena file, padding first when the value fits
    /// in one page but would otherwise straddle a page boundary. Values larger than the
    /// staging buffer are written straight through to the stream.
    /// </summary>
    /// <returns>The <see cref="NodeRef"/> (file id + file-absolute offset) of the written value.</returns>
    /// <exception cref="InvalidOperationException">
    /// The writer is completed or disposed, or the write would pass the 2 GiB per-file offset ceiling.
    /// </exception>
    public NodeRef WriteRlp(ReadOnlySpan<byte> rlp)
    {
        if (_completed || _disposed)
            throw new InvalidOperationException("BlobArenaWriter is closed.");

        long offsetInPage = _written & PageLayout.PageMask;
        if (rlp.Length <= PageLayout.PageSize && offsetInPage != 0 && offsetInPage + rlp.Length > PageLayout.PageSize)
        {
            // Zero-filled pad up to the next page boundary; readers skip it because value
            // bounds come from per-entry length metadata.
            int pad = (int)(PageLayout.PageSize - offsetInPage);
            EnsureBufferSpace(pad)[..pad].Clear();
            _buffered += pad;
            _written += pad;
        }

        if (_written + rlp.Length > int.MaxValue)
            throw new InvalidOperationException(
                $"BlobArenaWriter for blob arena {_blobArenaId} would exceed the 2 GiB per-file NodeRef offset ceiling.");

        int offset = (int)_written;
        if (rlp.Length > _buffer.Count)
        {
            // Trie-node RLP is normally far smaller than the buffer, but a value that cannot
            // fit even in an empty buffer must not fail in CopyTo: drain what is staged (to
            // keep byte order) and write the value directly.
            FlushBuffer();
            _stream.Write(rlp);
        }
        else
        {
            rlp.CopyTo(EnsureBufferSpace(rlp.Length));
            _buffered += rlp.Length;
        }
        _written += rlp.Length;
        return new NodeRef(_blobArenaId, offset);
    }

    /// <summary>
    /// Finalise the write: flush the in-memory buffer to the file, close the stream and
    /// register the new frontier with the manager. The writer's own lease on the file is
    /// still held — it is released by <see cref="Dispose"/>.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The writer has already been disposed.</exception>
    /// <exception cref="InvalidOperationException">The writer has already been completed.</exception>
    public void Complete()
    {
        // After Dispose the staging buffer is freed and the write was already cancelled with
        // the manager — fail before touching either.
        if (_disposed) throw new ObjectDisposedException(nameof(BlobArenaWriter));
        if (_completed) throw new InvalidOperationException("BlobArenaWriter already completed.");
        FlushBuffer();
        _stream.Flush();
        _stream.Dispose();
        _completed = true;
        // Writer mutates the file directly. Manager learns whether the id is still a
        // candidate for the next writer's packing scan and pushes the post-write
        // frontier delta to the allocated-bytes gauge.
        _file.Frontier = _written;
        _manager.OnWriteCompleted(_file, hasHeadroom: _file.Frontier < _file.MaxSize);
    }

    /// <summary>
    /// <c>fsync(2)</c> the underlying blob file. Must be called after <see cref="Complete"/> —
    /// Complete flushes the writer's in-memory buffer through the stream; this method blocks
    /// until those bytes are durable on disk.
    /// </summary>
    /// <exception cref="InvalidOperationException"><see cref="Complete"/> has not run yet.</exception>
    public void Fsync()
    {
        if (!_completed) throw new InvalidOperationException("BlobArenaWriter.Fsync requires Complete first.");
        _file.Fsync();
    }

    /// <summary>
    /// Release the staging buffer and the writer's file lease; cancels the write with the
    /// manager when <see cref="Complete"/> never ran. Idempotent.
    /// </summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;
        if (!_completed)
        {
            _stream.Dispose();
            // Cancelled mid-write — frontier didn't advance, so the file still has room.
            // Manager re-adds the id to the mutable pool without touching the file.
            _manager.OnWriteCancelled(_blobArenaId);
        }
        _buffer.Dispose();
        // Drop the writer's lease on the file. If a snapshot has already picked the file
        // up via TryLeaseFile, this just decrements one lease; if nobody else holds a
        // lease, the file stays alive on the manager's array-slot ref until shutdown / sweep.
        _file.Dispose();
    }

    // Returns the free tail of the staging buffer, draining it to the stream first when
    // fewer than sizeHint bytes are free. Callers pass sizeHint <= buffer capacity.
    private Span<byte> EnsureBufferSpace(int sizeHint)
    {
        if (sizeHint > _buffer.Count - _buffered) FlushBuffer();
        return _buffer.AsSpan()[_buffered..];
    }

    // Writes the staged bytes to the stream and resets the cursor.
    private void FlushBuffer()
    {
        if (_buffered == 0) return;
        _stream.Write(_buffer.AsSpan()[.._buffered]);
        _buffered = 0;
    }
}
+ internal void Write(Span span) + { + BinaryPrimitives.WriteUInt16LittleEndian(span, BlobArenaId); + BinaryPrimitives.WriteInt64LittleEndian(span[2..], Offset); + BinaryPrimitives.WriteInt64LittleEndian(span[10..], Length); + } + +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/CatalogEntry.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/CatalogEntry.cs new file mode 100644 index 000000000000..34ee073edc7f --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/CatalogEntry.cs @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// <summary> +/// A single catalog entry describing a persisted snapshot's identity, metadata-arena location and +/// persisted <see cref="SnapshotTier"/>. The contiguous blob-RLP region (base snapshots only) lives in +/// the snapshot's own metadata table under the blob_range key, not here. +/// </summary> +public sealed record CatalogEntry( + StateId From, + StateId To, + SnapshotLocation Location, + SnapshotTier Tier); diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/IArenaManager.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/IArenaManager.cs new file mode 100644 index 000000000000..02fef30947fe --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/IArenaManager.cs @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +public interface IArenaManager : IDisposable +{ + void Initialize(IReadOnlyList entries); + + /// <summary> + /// Create an <see cref="ArenaWriter"/> for a new snapshot slice. + /// </summary> + /// <param name="estimatedSize">Estimated byte size of the slice; drives the shared-vs-dedicated arena choice.</param> 
+ /// + /// true for sub-CompactSize snapshots (PersistedBase / PersistedSmallCompacted), + /// which are packed into their own arena files separate from the larger tiers. + /// + ArenaWriter CreateWriter(long estimatedSize, bool small = false); + ArenaReservation Open(in SnapshotLocation location); + + /// + /// Drop bytes of as dead. The caller + /// (typically ) handles file-side madvise / + /// posix_fadvise itself, so this method only does the atomic set/dict/metric + /// bookkeeping that needs the manager's lock. + /// + /// + /// true if the file survives in the manager (still has live data); false if + /// this call removed it (all bytes dead) or the manager is shutting down. Callers use this + /// to skip disk reclamation on a file that is about to be deleted or preserved. + /// + bool MarkDead(ArenaFile file, long deadSize); + + /// + /// Punch a hole over the [offset, offset + size) range of + /// to free its disk blocks, when both the operator config flag and the adaptive + /// per-manager support flag allow it. The adaptive flag latches off permanently after + /// the first filesystem-unsupported error. No-op for implementations without on-disk arenas. + /// + /// + /// true if the range was actually hole-punched — the kernel has invalidated its + /// page cache, so the caller can skip a follow-up posix_fadvise(DONTNEED); + /// false if punch-hole was skipped (config / adaptive flag) or failed. 
+ /// + bool TryPunchHole(ArenaFile file, long offset, long size); +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ISnapshotCatalog.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ISnapshotCatalog.cs new file mode 100644 index 000000000000..33c2dac29cee --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/ISnapshotCatalog.cs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Persisted-snapshot metadata catalog: the source of truth for which persisted snapshots exist across +/// restarts. is wired in its place when long finality is disabled. +/// +public interface ISnapshotCatalog +{ + /// Persist a catalog entry, keyed by its (To, depth) tuple. + void Add(CatalogEntry entry); + + /// Remove the entry at (to, depth). Returns true when one was present. + bool Remove(in StateId to, long depth); + + /// Stream all catalog entries (unordered); eagerly version-checks and seeds metadata. + IEnumerable Load(); +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/PosixReclaim.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/PosixReclaim.cs new file mode 100644 index 000000000000..0ea54934e835 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/PosixReclaim.cs @@ -0,0 +1,124 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Runtime.InteropServices; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Thin fd-based wrappers over the Linux fallocate / posix_fadvise syscalls, +/// used to reclaim disk blocks and OS file-cache pages of dead persisted-snapshot arena +/// ranges. Shared by both the metadata arena (, mmap-backed) and the +/// blob arena (, pread-backed). 
+/// +internal static class PosixReclaim +{ + internal enum PunchHoleOutcome + { + /// The range was hole-punched (or there was nothing to punch). + Done, + + /// The filesystem/kernel permanently does not support hole-punching. + Unsupported, + + /// A transient error — hole-punching may succeed on a later call. + Failed, + } + + private const int FALLOC_FL_KEEP_SIZE = 0x01; + private const int FALLOC_FL_PUNCH_HOLE = 0x02; + private const int POSIX_FADV_DONTNEED = 4; + private const int POSIX_FADV_WILLNEED = 3; + // errno values that mean the call will never succeed on this filesystem/kernel. + private const int ENOSYS = 38; + private const int EOPNOTSUPP = 95; + private static readonly long PageSize = Environment.SystemPageSize; + + [DllImport("libc", EntryPoint = "fallocate", SetLastError = true)] + private static extern int Fallocate(int fd, int mode, long offset, long len); + + [DllImport("libc", EntryPoint = "posix_fadvise", SetLastError = true)] + private static extern int PosixFadvise(int fd, long offset, long len, int advice); + + [DllImport("libc", EntryPoint = "fdatasync", SetLastError = true)] + private static extern int FdatasyncSyscall(int fd); + + /// + /// fdatasync(2) on — block until every byte previously + /// written has reached durable storage. Skips the mtime/ctime metadata flush that + /// fsync(2) would do but still flushes the file size (required for future reads + /// of the auto-grown blob file). No-op on non-Linux (test environments only — + /// durability matters on the production Linux target). Throws + /// on errno. + /// + internal static void Fsync(int fd) + { + if (!OperatingSystem.IsLinux()) return; + if (FdatasyncSyscall(fd) == 0) return; + int err = Marshal.GetLastPInvokeError(); + throw new IOException($"fdatasync failed: errno {err}"); + } + + /// + /// posix_fadvise(POSIX_FADV_DONTNEED) over the page-aligned subrange of + /// [offset, offset + size), dropping it from the OS file cache. 
No-op on + /// non-Linux; fire-and-forget (the return code is not inspected). + /// + internal static void FadviseDontNeed(int fd, long offset, long size) + { + if (!OperatingSystem.IsLinux()) return; + (long start, long len) = AlignInward(offset, size); + // AlignInward keeps only whole pages inside the range, so a sub-page range leaves nothing to advise. + if (len <= 0) return; + PosixFadvise(fd, start, len, POSIX_FADV_DONTNEED); + } + + /// <summary> + /// posix_fadvise(POSIX_FADV_WILLNEED) over [offset, offset + size), asking + /// the kernel to start asynchronous read-ahead for the range. No-op on non-Linux; + /// fire-and-forget (the return code is not inspected). + /// </summary> + /// <remarks> + /// Unlike <see cref="FadviseDontNeed"/> the range is passed unaligned: WILLNEED + /// must cover the whole region (including the partial pages at either end), and + /// the kernel page-aligns the request internally. Inward alignment would shave the first + /// and last page — a base snapshot's region boundaries are not page-aligned. + /// </remarks> + internal static void FadviseWillNeed(int fd, long offset, long size) + { + if (!OperatingSystem.IsLinux()) return; + if (size <= 0) return; + PosixFadvise(fd, offset, size, POSIX_FADV_WILLNEED); + } + + /// <summary> + /// fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) over the page-aligned + /// subrange of [offset, offset + size), freeing the underlying disk blocks + /// without changing the file length. A successful punch also invalidates the OS page + /// cache for the range, so a follow-up posix_fadvise(DONTNEED) is unnecessary. + /// </summary> + /// + /// <see cref="PunchHoleOutcome.Done"/> on success (or an empty range); + /// <see cref="PunchHoleOutcome.Unsupported"/> on non-Linux or a permanent + /// EOPNOTSUPP / ENOSYS; <see cref="PunchHoleOutcome.Failed"/> on any + /// other (transient) errno. 
+ /// + internal static PunchHoleOutcome TryPunchHole(int fd, long offset, long size) + { + if (!OperatingSystem.IsLinux()) return PunchHoleOutcome.Unsupported; + (long start, long len) = AlignInward(offset, size); + if (len <= 0) return PunchHoleOutcome.Done; + if (Fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, len) == 0) + return PunchHoleOutcome.Done; + int err = Marshal.GetLastPInvokeError(); + return err is EOPNOTSUPP or ENOSYS ? PunchHoleOutcome.Unsupported : PunchHoleOutcome.Failed; + } + + // Round offset up and end down to OS-page boundaries so only fully-covered pages are + // touched — prevents a hole punch from zeroing a partial page shared with a neighbouring reservation. + private static (long start, long len) AlignInward(long offset, long size) + { + long start = (offset + PageSize - 1) & ~(PageSize - 1); + long end = (offset + size) & ~(PageSize - 1); + return (start, end - start); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/SnapshotCatalog.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/SnapshotCatalog.cs new file mode 100644 index 000000000000..951176f62936 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/SnapshotCatalog.cs @@ -0,0 +1,157 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Buffers.Binary; +using Nethermind.Core.Crypto; +using Nethermind.Db; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Persists snapshot metadata in a key-value store (RocksDB column or MemDb). +/// Each entry is keyed by its 48-byte tuple (To.BlockNumber, To.StateRoot, depth) +/// — 8-byte big-endian block number, 32-byte state root, 8-byte big-endian depth +/// (To.BlockNumber - From.BlockNumber). 
The depth disambiguates entries that +/// share the same To across the three runtime buckets (base, compacted, +/// CompactSized) so each survives independently across a restart. The reserved 4-byte +/// key stores the catalog-version word; entry keys are 48 bytes, so the lengths +/// cannot collide. +/// +public sealed class SnapshotCatalog(IDb db) : ISnapshotCatalog +{ + // Binary layout per entry: fromBlock(8) + fromRoot(32) + toBlock(8) + toRoot(32) + + // arenaId(4) + offset(8) + size(8) + tier(1) = 101 + private const int EntrySize = 101; + + private const int KeySize = 48; + + // Catalog version: bumped when the on-disk binary layout changes incompatibly. Old + // directories will fail to load with a clear "wipe and resync" message. + // v2: persisted-snapshot metadata switched from the columnar format to the single-level + // sorted table — the old metadata blobs are unreadable by the new reader. + // v3: sorted table moved to a sparse (per-8-record) offset index, 1-byte key/value sizes, and + // per-id ref-id records — incompatible with the v2 dense-offset layout. + // v4: sorted-table keys are front-coded (per-block prefix compression) — incompatible record + // layout vs v3. + // v5: sorted table became two-level — 4 KB data blocks with an in-block restart table and a + // tail separator-key index — incompatible with the v4 single-level sparse-offset layout. + // v6: sorted table reuses one self-describing block format for both levels; data blocks are + // 4 KiB-aligned and addressed by block number, and the index is a single block (separator → + // block number) — incompatible with the v5 byte-offset tail index. + // v7: sorted-table footer widened to i64 fields and the (unaligned) index block is located by a + // stored byte offset instead of being recomputed from the block count — incompatible footer. 
+ // v8: trie-node key encoding aligned to the persistence layout — state top tier is 3-byte (path + // length 0-5) and storage drops its 4-byte top tier (0-15 use the 8-byte compact encoding). + private const int CurrentVersion = 8; + + private static readonly byte[] MetadataKey = new byte[4]; + + private readonly IDb _db = db; + + public void Add(CatalogEntry entry) + { + Span key = stackalloc byte[KeySize]; + WriteKey(key, entry.To, Depth(entry)); + byte[] value = new byte[EntrySize]; + WriteEntry(value, entry); + _db.Set(key, value); + } + + public bool Remove(in StateId to, long depth) + { + Span key = stackalloc byte[KeySize]; + WriteKey(key, to, depth); + if (!_db.KeyExists(key)) return false; + _db.Remove(key); + return true; + } + + private static long Depth(CatalogEntry entry) => entry.To.BlockNumber - entry.From.BlockNumber; + + /// + /// Streams catalog entries lazily (unordered). The version check and first-write of the + /// metadata word happen eagerly before the iterator is returned, not on enumeration. + /// + public IEnumerable Load() + { + byte[]? meta = _db.Get(MetadataKey); + if (meta is not null) + { + if (meta.Length != 4) + throw new InvalidOperationException( + $"Persisted snapshot catalog metadata has unexpected length {meta.Length} (expected 4). " + + "The persisted_snapshot/ directory has an incompatible layout — wipe and resync."); + + int version = BinaryPrimitives.ReadInt32LittleEndian(meta); + if (version != CurrentVersion) + throw new InvalidOperationException( + $"Persisted snapshot catalog version mismatch: on-disk v{version}, runtime expects v{CurrentVersion}. " + + "The persisted_snapshot/ directory has an incompatible layout — wipe and resync."); + } + else + { + WriteMetadata(); + } + + return EnumerateEntries(); + } + + private IEnumerable EnumerateEntries() + { + foreach (KeyValuePair kv in _db.GetAll(ordered: false)) + { + // Entry keys are exactly KeySize; the metadata key is 4 bytes. 
+ if (kv.Key.Length != KeySize) continue; + if (kv.Value is null || kv.Value.Length != EntrySize) continue; + yield return ReadEntry(kv.Value); + } + } + + private void WriteMetadata() + { + byte[] value = new byte[4]; + BinaryPrimitives.WriteInt32LittleEndian(value, CurrentVersion); + _db.Set(MetadataKey, value); + } + + private static void WriteKey(Span span, in StateId to, long depth) + { + BinaryPrimitives.WriteInt64BigEndian(span, to.BlockNumber); + to.StateRoot.BytesAsSpan.CopyTo(span[8..]); + BinaryPrimitives.WriteInt64BigEndian(span[40..], depth); + } + + private static void WriteEntry(Span span, CatalogEntry entry) + { + BinaryPrimitives.WriteInt64LittleEndian(span, entry.From.BlockNumber); + entry.From.StateRoot.BytesAsSpan.CopyTo(span[8..]); + BinaryPrimitives.WriteInt64LittleEndian(span[40..], entry.To.BlockNumber); + entry.To.StateRoot.BytesAsSpan.CopyTo(span[48..]); + BinaryPrimitives.WriteInt32LittleEndian(span[80..], entry.Location.ArenaId); + BinaryPrimitives.WriteInt64LittleEndian(span[84..], entry.Location.Offset); + BinaryPrimitives.WriteInt64LittleEndian(span[92..], entry.Location.Size); + span[100] = (byte)entry.Tier; + } + + private static CatalogEntry ReadEntry(ReadOnlySpan span) + { + long fromBlock = BinaryPrimitives.ReadInt64LittleEndian(span); + ValueHash256 fromRoot = new(span.Slice(8, 32)); + StateId from = new(fromBlock, fromRoot); + + long toBlock = BinaryPrimitives.ReadInt64LittleEndian(span[40..]); + ValueHash256 toRoot = new(span.Slice(48, 32)); + StateId to = new(toBlock, toRoot); + + int arenaId = BinaryPrimitives.ReadInt32LittleEndian(span[80..]); + long offset = BinaryPrimitives.ReadInt64LittleEndian(span[84..]); + long size = BinaryPrimitives.ReadInt64LittleEndian(span[92..]); + SnapshotTier tier = (SnapshotTier)span[100]; + if (!tier.IsPersisted()) + throw new InvalidOperationException( + $"Persisted snapshot catalog entry has non-persisted tier byte {span[100]} (only Persisted* tiers are ever stored). 
" + + "The persisted_snapshot/ directory has an incompatible or corrupted layout — wipe and resync."); + + return new CatalogEntry(from, to, new SnapshotLocation(arenaId, offset, size), tier); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/SnapshotLocation.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/SnapshotLocation.cs new file mode 100644 index 000000000000..801d21ce4ba1 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/SnapshotLocation.cs @@ -0,0 +1,6 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +public readonly record struct SnapshotLocation(int ArenaId, long Offset, long Size); diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/WholeReadSession.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/WholeReadSession.cs new file mode 100644 index 000000000000..f5943cf2bd37 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/WholeReadSession.cs @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// Scoped whole-buffer view over an . Opens a fresh +/// per-reservation mmap view with MADV_NORMAL hint (distinct from the global +/// random-access view used by point queries) and acquires a lease on the reservation. +/// Disposing releases the lease; when adviseDontNeedOnDispose is true it +/// also issues madvise(MADV_DONTNEED) on the range so the kernel can reclaim those +/// pages from the page cache. 
+///
+/// <remarks>
+/// Also serves as the <c>IByteReaderSource</c> for the reservation:
+/// the mmap base pointer is captured once at construction (a single <c>DataPtr</c> read on the
+/// underlying <see cref="ArenaFile.MmapWholeView"/>) so <see cref="CreateReader"/> mints fresh
+/// pointer-backed readers on the merge/scan hot path with no per-call indirection or
+/// dispose check. Callers must keep the session alive while any reader derived from it
+/// is in use.
+/// </remarks>
+public sealed unsafe class WholeReadSession : IDisposable, IByteReaderSource
+{
+    private readonly ArenaReservation _reservation;
+    private readonly ArenaFile.MmapWholeView _view;
+    private readonly byte* _basePtr;
+    private readonly long _size;
+    private bool _disposed;
+
+    /// <summary>
+    /// Leases <paramref name="reservation"/> and opens its whole-range view, caching the view's
+    /// base pointer and size for <see cref="CreateReader"/>.
+    /// </summary>
+    /// <param name="reservation">Reservation leased for the lifetime of the session.</param>
+    /// <param name="adviseDontNeedOnDispose">Forwarded unchanged to <c>OpenWholeView</c>.</param>
+    internal WholeReadSession(ArenaReservation reservation, bool adviseDontNeedOnDispose)
+    {
+        _reservation = reservation;
+        _reservation.AcquireLease();
+        try
+        {
+            _view = _reservation.OpenWholeView(adviseDontNeedOnDispose);
+        }
+        catch
+        {
+            // The constructor is failing, so no caller ever receives an instance to Dispose():
+            // release the lease taken above here, the same way Dispose() would, then rethrow.
+            _reservation.Dispose();
+            throw;
+        }
+
+        _basePtr = _view.DataPtr;
+        _size = _view.Size;
+    }
+
+    ///
+    /// Materialise a fresh <see cref="WholeReadSessionReader"/> over the session's view, addressed
+    /// in the reservation's own offset space (offset 0 = first byte). Pointer-backed so >2 GiB
+    /// reservations are addressable. No dispose check — the caller guarantees the session is alive
+    /// (see the type remarks); this is the merge/scan hot path.
+ /// + public WholeReadSessionReader CreateReader() => new(_basePtr, _size); + + public void Dispose() + { + if (_disposed) return; + _disposed = true; + _view.Dispose(); + _reservation.Dispose(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/WholeReadSessionReader.cs b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/WholeReadSessionReader.cs new file mode 100644 index 000000000000..a4b05f4bb63d --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/PersistedSnapshots/Storage/WholeReadSessionReader.cs @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using System.Runtime.Intrinsics.X86; +using Nethermind.State.Flat.Io; + +namespace Nethermind.State.Flat.PersistedSnapshots.Storage; + +/// +/// over a 's mmap view. +/// Uses byte* + length to correctly address >2 GiB views; +/// each call constructs an int-sized +/// at the requested offset rather than spanning the whole reservation. +/// +/// The pointer lifetime is owned by the ; the session must remain alive for the duration of any use of this reader. 
+public readonly unsafe ref struct WholeReadSessionReader(byte* basePtr, long length) : IByteReader
+{
+    private readonly byte* _basePtr = basePtr;
+
+    /// <summary>Number of addressable bytes behind the base pointer.</summary>
+    public long Length => length;
+
+    /// <summary>
+    /// Copies <c>output.Length</c> bytes starting at <paramref name="offset"/> into
+    /// <paramref name="output"/>.
+    /// </summary>
+    /// <returns>
+    /// <c>true</c> when <c>[offset, offset + output.Length)</c> lies entirely inside the view;
+    /// <c>false</c> (nothing copied) otherwise, including for a negative <paramref name="offset"/>.
+    /// </returns>
+    public bool TryRead(long offset, scoped Span<byte> output)
+    {
+        // Compare by subtraction, never by addition: (ulong)offset + (ulong)count wraps around
+        // for a negative offset, which would let an out-of-range read through.
+        ulong start = (ulong)offset;
+        ulong total = (ulong)length;
+        if (start > total || (ulong)output.Length > total - start) return false;
+        new ReadOnlySpan<byte>(_basePtr + offset, output.Length).CopyTo(output);
+        return true;
+    }
+
+    /// <summary>
+    /// Exposes the bytes described by <paramref name="bound"/> as a span over the view, without copying.
+    /// </summary>
+    /// <exception cref="ArgumentOutOfRangeException">
+    /// <paramref name="bound"/> does not lie entirely inside the view (negative offset or length included).
+    /// </exception>
+    /// <exception cref="OverflowException">The bound is in range but longer than <see cref="int.MaxValue"/> bytes.</exception>
+    public NoOpPin PinBuffer(Bound bound)
+    {
+        // Same overflow-free range check as TryRead.
+        ulong start = (ulong)bound.Offset;
+        ulong total = (ulong)length;
+        if (start > total || (ulong)bound.Length > total - start)
+            throw new ArgumentOutOfRangeException(nameof(bound));
+        return new NoOpPin(new ReadOnlySpan<byte>(_basePtr + bound.Offset, checked((int)bound.Length)));
+    }
+
+    /// <summary>
+    /// Prefetches the body of a BTree node whose first byte was just read (page + TLB now resident):
+    /// pulls the two cache lines after the header line so the floor-search's key scan finds them warm.
+    /// <paramref name="offset"/> is the node start; line 0 is already cached from the flag-byte read.
+    /// </summary>
+    public void Prefetch(long offset)
+    {
+        if (!Sse.IsSupported || (ulong)offset >= (ulong)length) return;
+        byte* p = _basePtr + offset;
+        Sse.Prefetch0(p + 64);
+        Sse.Prefetch0(p + 128);
+    }
+}
diff --git a/src/Nethermind/Nethermind.State.Flat/Persistence/BaseTriePersistence.cs b/src/Nethermind/Nethermind.State.Flat/Persistence/BaseTriePersistence.cs index 139d5caf72bb..102f5be4691c 100644 --- a/src/Nethermind/Nethermind.State.Flat/Persistence/BaseTriePersistence.cs +++ b/src/Nethermind/Nethermind.State.Flat/Persistence/BaseTriePersistence.cs @@ -81,13 +81,7 @@ public static class BaseTriePersistence private static ReadOnlySpan EncodeStateTopNodeKey(Span buffer, in TreePath path) { - // Looks like this <3-byte-path> - // Last 4 bit of the path is the length - - path.Path.Bytes[0..StateNodesTopPathLength].CopyTo(buffer); - // Pack length into lower 4 bits of last byte (upper 4 bits contain path data) - byte lengthAsByte = (byte)path.Length; - buffer[StateNodesTopPathLength - 1] = (byte)((buffer[StateNodesTopPathLength - 1] &
0xf0) | (lengthAsByte & 0x0f)); + path.EncodeWith3Byte(buffer); return buffer[..StateNodesTopPathLength]; } diff --git a/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/BloomFilter.cs b/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/BloomFilter.cs index f887592e28b3..58490ee1d68c 100644 --- a/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/BloomFilter.cs +++ b/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/BloomFilter.cs @@ -33,13 +33,13 @@ public sealed unsafe class BloomFilter : IDisposable public long Capacity { get; } public double BitsPerKey { get; } - public int K { get; } + private int K { get; } public long Count => Volatile.Read(ref _count); // Total bloom data bytes (no header), always multiple of 64 bytes public long DataBytes { get; } - public long NumBlocks { get; } // number of 64B cache lines + private long NumBlocks { get; } // number of 64B cache lines private long _count; @@ -121,6 +121,24 @@ public BloomFilter(long capacity, double bitsPerKey, long initialCount = 0) } } + /// + /// Construct a sentinel bloom whose always returns true. + /// + /// + /// Used by the bloom-disabled config path (PersistedSnapshotBloomBitsPerKey == 0 or + /// degenerate capacity-zero builds) to keep downstream APIs non-nullable: every snapshot + /// has a real , and the disabled mode just behaves as + /// "the bloom never filters anything out". One small native allocation (a single 64-byte + /// cache line — the minimum the constructor produces) per call; callers own disposal + /// the same as any other . + /// + public static BloomFilter AlwaysTrue() + { + BloomFilter b = new(capacity: 1, bitsPerKey: 1.0); + new Span(b._data, checked((int)b._dataSize)).Fill(0xFF); + return b; + } + /// /// Returns the 64B cacheline byte offset within the bloom data that was touched. 
/// @@ -199,8 +217,6 @@ public void Clear() Volatile.Write(ref _count, 0); } - internal byte* DangerousGetDataPointer() => _data; - public void Dispose() { if (Interlocked.Exchange(ref _disposed, 1) != 0) return; diff --git a/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/RefCountedBloomFilter.cs b/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/RefCountedBloomFilter.cs new file mode 100644 index 000000000000..01fb96051396 --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/Persistence/BloomFilter/RefCountedBloomFilter.cs @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Utils; + +namespace Nethermind.State.Flat.Persistence.BloomFilter; + +/// +/// Ref-counted owner of a single . The wrapped native filter is disposed — and +/// its contribution to reversed — only once every lease +/// has been released, so one filter can back several s. +/// +/// +/// A large compaction adopts its merged bloom as the (superset) pre-filter of every snapshot it contains: +/// each contained snapshot is re-registered as a twin holding a lease on this wrapper, and the filter +/// survives until the big snapshot and all twins (and their in-flight readers) drain. Keeping the lease +/// count out of leaves that type a pure data structure. +/// +public sealed class RefCountedBloomFilter : SmallRefCountingDisposable +{ + private readonly BloomFilter _filter; + + public RefCountedBloomFilter(BloomFilter filter) + { + _filter = filter; + Interlocked.Add(ref Metrics._persistedSnapshotBloomMemory, filter.DataBytes); + Interlocked.Increment(ref Metrics._persistedSnapshotBloomCount); + } + + /// A freshly-owned sentinel — correct (no false + /// negatives) but unfiltered — for snapshots whose real bloom is built later (the placeholder + /// snapshot is then re-registered carrying that bloom). 
+ public static RefCountedBloomFilter AlwaysTrue() => new(BloomFilter.AlwaysTrue()); + + /// The wrapped filter. Valid for as long as the caller holds a lease on this wrapper. + public BloomFilter Filter => _filter; + + protected override void CleanUp() + { + Interlocked.Add(ref Metrics._persistedSnapshotBloomMemory, -_filter.DataBytes); + Interlocked.Decrement(ref Metrics._persistedSnapshotBloomCount); + _filter.Dispose(); + } +} diff --git a/src/Nethermind/Nethermind.State.Flat/Persistence/IPersistence.cs b/src/Nethermind/Nethermind.State.Flat/Persistence/IPersistence.cs index c4dc81ef093a..5855eefe6111 100644 --- a/src/Nethermind/Nethermind.State.Flat/Persistence/IPersistence.cs +++ b/src/Nethermind/Nethermind.State.Flat/Persistence/IPersistence.cs @@ -40,9 +40,7 @@ public interface IPersistenceReader : IDisposable bool TryGetStorageRaw(in ValueHash256 addrHash, in ValueHash256 slotHash, ref SlotValue value); IFlatIterator CreateAccountIterator(in ValueHash256 startKey, in ValueHash256 endKey); - IFlatIterator CreateAccountIterator() => CreateAccountIterator(ValueKeccak.Zero, ValueKeccak.MaxValue); IFlatIterator CreateStorageIterator(in ValueHash256 accountKey, in ValueHash256 startSlotKey, in ValueHash256 endSlotKey); - IFlatIterator CreateStorageIterator(in ValueHash256 accountKey) => CreateStorageIterator(accountKey, ValueKeccak.Zero, ValueKeccak.MaxValue); bool IsPreimageMode { get; } } diff --git a/src/Nethermind/Nethermind.State.Flat/PersistenceManager.cs b/src/Nethermind/Nethermind.State.Flat/PersistenceManager.cs index 551b9b8cbb00..018218fab56e 100644 --- a/src/Nethermind/Nethermind.State.Flat/PersistenceManager.cs +++ b/src/Nethermind/Nethermind.State.Flat/PersistenceManager.cs @@ -3,6 +3,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; +using Nethermind.Config; using Nethermind.Core; using Nethermind.Core.Attributes; using Nethermind.Core.Collections; @@ -11,8 +12,14 @@ using Nethermind.Int256; using Nethermind.Logging; using 
Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Storage; using Nethermind.Trie; using Nethermind.Trie.Pruning; +using WholeReadScanner = Nethermind.State.Flat.PersistedSnapshots.PersistedSnapshotScanner< + Nethermind.State.Flat.PersistedSnapshots.Storage.WholeReadSession, + Nethermind.State.Flat.PersistedSnapshots.Storage.WholeReadSessionReader, + Nethermind.State.Flat.Io.NoOpPin>; [assembly: InternalsVisibleTo("Nethermind.State.Flat.Test")] [assembly: InternalsVisibleTo("Nethermind.Synchronization.Test")] @@ -21,19 +28,36 @@ namespace Nethermind.State.Flat; public class PersistenceManager( IFlatDbConfig configuration, - ICompactionSchedule compactionSchedule, + ICompactionSchedule schedule, IFinalizedStateProvider finalizedStateProvider, IPersistence persistence, ISnapshotRepository snapshotRepository, - ILogManager logManager) : IPersistenceManager + ILogManager logManager, + IPersistedSnapshotCompactor compactor, + IPersistedSnapshotLoader loader, + IProcessExitSource processExitSource) : IPersistenceManager, IDisposable { private readonly ILogger _logger = logManager.GetClassLogger(); + // Linked to process exit so the conversion Parallel.ForEach below cancels at shutdown-start — + // before DI disposal order matters — letting the owning FlatDbManager.RunPersistence task drain. + private readonly CancellationTokenSource _cts = CancellationTokenSource.CreateLinkedTokenSource(processExitSource.Token); private readonly int _minReorgDepth = configuration.MinReorgDepth; - private readonly int _maxReorgDepth = configuration.MaxReorgDepth; + private readonly int _maxInMemoryBaseSnapshotCount = configuration.MaxInMemoryBaseSnapshotCount; + // Force-persist backstop depth: the long-finality window when enabled (the persisted tier serves + // deep reorgs), otherwise the smaller non-long-finality MaxReorgDepth. 
Raised to at least one + // CompactSize above MinReorgDepth so the normal finalized-persistence trigger (which engages around + // MinReorgDepth) always has room to act before the backstop fires. This lets MinReorgDepth be + // configured at or above the backstop without the two thresholds colliding — the backstop is + // adjusted up accordingly. + private readonly int _backstopReorgDepth = Math.Max( + configuration.EnableLongFinality ? configuration.LongFinalityMaxReorgDepth : configuration.MaxReorgDepth, + configuration.MinReorgDepth + configuration.CompactSize); private readonly int _compactSize = configuration.CompactSize; - private readonly ICompactionSchedule _schedule = compactionSchedule; + private readonly bool _enableLongFinality = configuration.EnableLongFinality; private readonly List<(Hash256, TreePath)> _trieNodesSortBuffer = []; // Presort make it faster - private readonly Lock _persistenceLock = new(); + // SemaphoreSlim rather than a Lock: the AddToPersistence drain awaits the compactor's async + // Enqueue while holding the mutex, which a Lock.Scope (a ref struct) cannot span. + private readonly SemaphoreSlim _persistenceLock = new(1, 1); private StateId _currentPersistedStateId = StateId.PreGenesis; @@ -49,137 +73,290 @@ public StateId GetCurrentPersistedStateId() return _currentPersistedStateId; } - private Snapshot? GetFinalizedSnapshotAtBlockNumber(long blockNumber, StateId currentPersistedState, bool compactedSnapshot) + /// + /// Two-phase action: Phase 1 (persistence to RocksDB) runs first; Phase 2 (conversion to + /// the persisted-snapshot tier) runs only when Phase 1 returns no candidate. 
+ /// + /// + /// Phase 1 seed selection — the finalized trigger and the backstop are evaluated independently, + /// the backstop being a fallback rather than an alternative so it stays reachable even when the + /// finalized trigger ran but found nothing to persist: + /// + /// Finalized trigger: if finalizedBlock >= persistedBlock + CompactSize AND + /// snapshotsDepth + CompactSize > MinReorgDepth → seed = canonical state at + /// the next boundary block (persistedBlock + CompactSize). Looked up via + /// — the boundary is always locally synced even + /// during catch-up sync where the CL-reported finalized tip is beyond the chain head. + /// Backstop fallback (if the finalized trigger persisted nothing): if + /// snapshotsDepth > the backstop depth (LongFinalityMaxReorgDepth when long + /// finality is enabled, otherwise MaxReorgDepth, raised to at least + /// MinReorgDepth + CompactSize) → seed = the committed head. + /// Otherwise → no candidate; Phase 1 doesn't run, fall through to Phase 2. + /// + /// Phase 2 runs only with enabled AND + /// SnapshotCount > MaxInMemoryBaseSnapshotCount. + /// + internal (PersistedSnapshot? ToPersistPersistedSnapshot, Snapshot? ToPersist, ConversionCandidate? ToConvert) DetermineSnapshotAction(StateId latestSnapshot) { - Hash256? finalizedStateRoot = finalizedStateProvider.GetFinalizedStateRootAt(blockNumber); - using ArrayPoolList states = snapshotRepository.GetStatesAtBlockNumber(blockNumber); - foreach (StateId stateId in states) - { - if (stateId.StateRoot != finalizedStateRoot) continue; + StateId currentPersistedState = GetCurrentPersistedStateId(); + long snapshotsDepth = latestSnapshot.BlockNumber - currentPersistedState.BlockNumber; - Snapshot? 
snapshot; - if (compactedSnapshot) - { - if (!snapshotRepository.TryLeaseCompactedState(stateId, out snapshot)) continue; - } - else + // ---- Phase 1: persistence to RocksDB ---- + long finalizedBlockNumber = finalizedStateProvider.FinalizedBlockNumber; + long nextBoundary = schedule.NextFullCompactionAfter(currentPersistedState.BlockNumber); + + // Normal finalized-driven persistence. Anchor at the next boundary block, not at the + // CL-reported finalized tip. The outer gate guarantees boundary <= finalizedBlockNumber, so + // the provider's own range check passes; the boundary is below chain head by construction, so + // the canonical header is in the block tree and FindHeader resolves. + if (finalizedBlockNumber >= nextBoundary + && snapshotsDepth + _compactSize > _minReorgDepth) + { + Hash256? canonicalRoot = finalizedStateProvider.GetFinalizedStateRootAt(nextBoundary); + if (canonicalRoot is not null) { - if (!snapshotRepository.TryLeaseState(stateId, out snapshot)) continue; + (PersistedSnapshot? persisted, Snapshot? inMemory) = snapshotRepository.FindSnapshotToPersist( + new StateId(nextBoundary, canonicalRoot), currentPersistedState, _compactSize); + if (persisted is not null || inMemory is not null) + return (persisted, inMemory, null); } + } - if (snapshot.From == currentPersistedState) + // Force-persist backstop: an independent safety net, NOT an alternative to the finalized + // trigger. It must stay reachable even when the finalized branch ran but produced no + // persistable candidate (e.g. its synthetic boundary seed matched no live snapshot). An + // `else if` here would let the always-satisfied finalized depth gate permanently shadow it + // once MinReorgDepth is configured near the backstop depth, so deep state would never persist. 
+ // Seed from the committed head so the forced persist follows the canonical chain rather than an + // arbitrary/longest fork (which RemoveSiblingAndDescendents would then orphan); fall back to the + // longest chain, then the latest state, only when nothing was committed this session. + if (snapshotsDepth > _backstopReorgDepth) + { + StateId backstopSeed = snapshotRepository.GetLastCommittedStateId() ?? snapshotRepository.GetLastSnapshotId() ?? latestSnapshot; + (PersistedSnapshot? persisted, Snapshot? inMemory) = + snapshotRepository.FindSnapshotToPersist(backstopSeed, currentPersistedState, _compactSize); + if (persisted is not null || inMemory is not null) { - if (_logger.IsDebug) _logger.Debug($"Persisting compacted state {stateId}"); - - return snapshot; + if (_logger.IsWarn) _logger.Warn( + $"In-memory state depth {snapshotsDepth} exceeded the force-persist backstop {_backstopReorgDepth}; " + + $"forcing persistence to bound memory (finalized block {finalizedBlockNumber})."); + return (persisted, inMemory, null); } - - snapshot.Dispose(); } - return null; + // ---- Phase 2: conversion to the persisted-snapshot tier ---- + if (!_enableLongFinality) return (null, null, null); + if (snapshotRepository.SnapshotCount <= _maxInMemoryBaseSnapshotCount) return (null, null, null); + + return (null, null, TryFindSnapshotToConvert(currentPersistedState)); } - private Snapshot? GetHeadAncestorAtBlockNumber(long blockNumber, StateId currentPersistedState, in StateId head, bool compactedSnapshot) + /// + /// Phase 2 — scan in-memory snapshots in ascending block-number order using two passes so + /// boundary-CompactSize compacted candidates (Branch A) globally win over base candidates + /// (Branch B), regardless of block-number ordering. Boundary compacted exist only at + /// multiples of while bases exist at every block, so a + /// single-pass ascending walk would always pick the smallest-block base first and starve + /// the boundary candidates. 
+ /// + /// + /// Both passes share the same ordered list and the same on-disk gate + /// ( — either equals or is + /// the To of an existing persisted base snapshot). Pass 1 keeps the + /// span == _compactSize guard so sub-CompactSize compacted (width 1/2/4/8/16, + /// produced by at non-boundary blocks) cannot be + /// returned as boundary candidates. + /// + private ConversionCandidate? TryFindSnapshotToConvert(StateId currentPersistedState) { - // Pick the state at blockNumber that is the head's ancestor rather than an arbitrary fork, so the - // forced persist follows the chain leading to the head instead of orphaning it. - if (!snapshotRepository.TryFindAncestorStateAtBlock(head, blockNumber, out StateId stateId)) - return null; + using ArrayPoolList ordered = snapshotRepository.GetStatesUpToBlock(long.MaxValue); - Snapshot? snapshot; - if (compactedSnapshot) + // Pass 1 (global): boundary-CompactSize in-memory compacted → Branch A. + foreach (StateId X in ordered) { - if (!snapshotRepository.TryLeaseCompactedState(stateId, out snapshot)) return null; - } - else - { - if (!snapshotRepository.TryLeaseState(stateId, out snapshot)) return null; + if (!snapshotRepository.TryLeaseInMemoryState(X, SnapshotTier.InMemoryCompacted, out Snapshot? compacted)) continue; + + if (compacted!.To.BlockNumber - compacted.From.BlockNumber == _compactSize + && IsOnDisk(compacted.From, currentPersistedState)) + { + return new ConversionCandidate(compacted, Base: null); + } + compacted.Dispose(); } - if (snapshot.From == currentPersistedState) + // Pass 2 (fallback): in-memory base → Branch B. + foreach (StateId X in ordered) { - if (_logger.IsWarn) _logger.Warn($"Force persisting state {stateId}"); + if (!snapshotRepository.TryLeaseInMemoryState(X, SnapshotTier.InMemoryBase, out Snapshot? 
baseSnap)) continue; - return snapshot; + if (IsOnDisk(baseSnap!.From, currentPersistedState)) + { + return new ConversionCandidate(Compacted: null, baseSnap); + } + baseSnap.Dispose(); } - snapshot.Dispose(); return null; } - internal Snapshot? DetermineSnapshotToPersist(StateId latestSnapshot) + private bool IsOnDisk(in StateId state, in StateId currentPersistedState) => + state == currentPersistedState || snapshotRepository.HasBaseSnapshot(state); + + internal sealed record ConversionCandidate(Snapshot? Compacted, Snapshot? Base); + + public async Task AddToPersistence(StateId latestSnapshot) { - // Actually, the latest compacted snapshot, not the latest snapshot. - long lastSnapshotNumber = latestSnapshot.BlockNumber; + await _persistenceLock.WaitAsync(); + try + { + // Bound the drain per invocation so a deep backlog (e.g. early catch-up sync) does + // not block the processing thread for an unbounded time. The caller re-enters on + // every block, so the remaining backlog is consumed across subsequent invocations. + const int MaxDrainIterations = 4; + for (int i = 0; i < MaxDrainIterations; i++) + { + (PersistedSnapshot? persistedToPersist, Snapshot? toPersist, ConversionCandidate? 
toConvert) = + DetermineSnapshotAction(latestSnapshot); - StateId currentPersistedState = GetCurrentPersistedStateId(); - long finalizedBlockNumber = finalizedStateProvider.FinalizedBlockNumber; - long inMemoryStateDepth = lastSnapshotNumber - currentPersistedState.BlockNumber; - if (inMemoryStateDepth - _compactSize < _minReorgDepth) + if (toPersist is not null) + { + using Snapshot _ = toPersist; + snapshotRepository.RemoveSiblingAndDescendents(toPersist.To); + PersistSnapshot(toPersist); + _currentPersistedStateId = toPersist.To; + snapshotRepository.RemoveStatesUntil(toPersist.To.BlockNumber); + } + else if (persistedToPersist is not null) + { + using PersistedSnapshot _ = persistedToPersist; + snapshotRepository.RemoveSiblingAndDescendents(persistedToPersist.To); + PersistPersistedSnapshot(persistedToPersist); + _currentPersistedStateId = persistedToPersist.To; + snapshotRepository.RemoveStatesUntil(persistedToPersist.To.BlockNumber); + } + else if (toConvert?.Compacted is not null) + { + await ConvertCompactedRange(toConvert.Compacted); + } + else if (toConvert?.Base is not null) + { + await ConvertSingleBase(toConvert.Base); + } + else + { + break; + } + } + } + finally { - // Keep some state in memory - return null; + _persistenceLock.Release(); } + } - Snapshot? snapshotToPersist; - - long nextCompactedBoundary = _schedule.NextFullCompactionAfter(currentPersistedState.BlockNumber); - if (nextCompactedBoundary > finalizedBlockNumber) + /// + /// Branch A — boundary CompactSize compacted: convert every in-memory base in the range it + /// spans and queue them for batched compaction. The CompactSized snapshot is produced by the + /// batched compactor (a linked merge of the bases), not here, so the compacted in-memory + /// snapshot is used only to delimit the block range. Disposes . 
+ /// + private async Task ConvertCompactedRange(Snapshot compacted) + { + try { - if (inMemoryStateDepth <= _maxReorgDepth) + long start = compacted.From.BlockNumber + 1; + long end = compacted.To.BlockNumber; + + ArrayPoolList allStateIds = new(64); + for (long b = start; b <= end; b++) { - // Unfinalized, and still under max reorg depth - return null; + using ArrayPoolList statesAtBlock = snapshotRepository.GetStatesAtBlockNumber(b); + foreach (StateId state in statesAtBlock) + allStateIds.Add(state); } - if (_logger.IsWarn) _logger.Warn($"Very long unfinalized state. Force persisting to conserve memory. finalized block number is {finalizedBlockNumber}."); - // Follow the committed head; fall back to the longest chain when nothing was committed this session. - StateId head = snapshotRepository.GetLastCommittedStateId() ?? snapshotRepository.GetLastSnapshotId() ?? latestSnapshot; - snapshotToPersist = GetHeadAncestorAtBlockNumber(nextCompactedBoundary, currentPersistedState, head, true) ?? - GetHeadAncestorAtBlockNumber(currentPersistedState.BlockNumber + 1, currentPersistedState, head, false); - } - else - { - snapshotToPersist = GetFinalizedSnapshotAtBlockNumber(nextCompactedBoundary, currentPersistedState, true) ?? - GetFinalizedSnapshotAtBlockNumber(currentPersistedState.BlockNumber + 1, currentPersistedState, false); - } + Parallel.ForEach( + allStateIds, + new ParallelOptions { CancellationToken = _cts.Token }, + state => + { + if (snapshotRepository.TryLeaseInMemoryState(state, SnapshotTier.InMemoryBase, out Snapshot? snap)) + { + long sw = Stopwatch.GetTimestamp(); + loader.ConvertAndRegister(snap); + Metrics.PersistedSnapshotConvertTime.Observe(Stopwatch.GetTimestamp() - sw); + snap.Dispose(); + } + }); + + // Remove exactly the converted in-memory snapshots — not RemoveStatesUntil(end), + // which would also drop snapshots added concurrently within the block range. 
Must + // run before the channel handoff below: the compactor takes ownership of + // allStateIds and disposes it. + foreach (StateId state in allStateIds) + { + // A To can exist in both in-memory tiers — remove from each. + snapshotRepository.RemoveAndReleaseInMemoryKnownState(state, SnapshotTier.InMemoryCompacted); + snapshotRepository.RemoveAndReleaseInMemoryKnownState(state, SnapshotTier.InMemoryBase); + } - if (snapshotToPersist is null) + await compactor.EnqueueAsync(allStateIds, GetCurrentPersistedStateId().BlockNumber, _cts.Token); + } + finally { - if (_logger.IsWarn) _logger.Warn($"Unable to find snapshot to persist. Current persisted state {currentPersistedState}. Compact size {_compactSize}."); + compacted.Dispose(); } - - return snapshotToPersist; } - public void AddToPersistence(StateId latestSnapshot) + /// + /// Branch B — single base convert (fragmented case: no full-CompactSize compacted available + /// for the candidate range yet). Disposes . + /// + private async Task ConvertSingleBase(Snapshot baseSnap) { - using Lock.Scope scope = _persistenceLock.EnterScope(); - // Attempt to add snapshots into bigcache - while (true) + try { - Snapshot? 
snapshotToSave = DetermineSnapshotToPersist(latestSnapshot); - - if (snapshotToSave is null) return; - using Snapshot _ = snapshotToSave; // dispose + long sw = Stopwatch.GetTimestamp(); + loader.ConvertAndRegister(baseSnap); + Metrics.PersistedSnapshotConvertTime.Observe(Stopwatch.GetTimestamp() - sw); - snapshotRepository.RemoveSiblingAndDescendents(snapshotToSave.To); + ArrayPoolList single = new(1) { baseSnap.To }; + await compactor.EnqueueAsync(single, GetCurrentPersistedStateId().BlockNumber, _cts.Token); - // Add the canon snapshot - PersistSnapshot(snapshotToSave); - _currentPersistedStateId = snapshotToSave.To; + snapshotRepository.RemoveAndReleaseInMemoryKnownState(baseSnap.To, SnapshotTier.InMemoryBase); + } + finally + { + baseSnap.Dispose(); } } /// - /// Force persist all snapshots regardless of finalization status. - /// Used by FlushCache to ensure all state is persisted before clearing caches. + /// Walk and persist every snapshot up to the current tip, ignoring the finality gate, and return + /// the resulting persisted state. /// + /// + /// Called only by the genesis loader (via FlatDbManager.FlushCache), for sync compatibility: + /// it advances the persisted RocksDB state all the way to the tip and prunes both tiers behind it, + /// leaving only the persisted state that the sync pipeline reads directly. Unlike + /// it has no per-call drain bound and seeds the walk from the + /// finalized state when available, falling back to the in-memory then tier-aware latest tip. + /// public StateId FlushToPersistence() { - using Lock.Scope scope = _persistenceLock.EnterScope(); + _persistenceLock.Wait(); + try + { + return FlushToPersistenceLocked(); + } + finally + { + _persistenceLock.Release(); + } + } + private StateId FlushToPersistenceLocked() + { StateId currentPersistedState = GetCurrentPersistedStateId(); // Follow the committed head; fall back to the longest chain when nothing was committed this session. StateId? 
latestStateId = snapshotRepository.GetLastCommittedStateId() ?? snapshotRepository.GetLastSnapshotId(); @@ -189,47 +366,50 @@ public StateId FlushToPersistence() return currentPersistedState; } - // Persist all snapshots from current persisted state to latest + // Persist all snapshots from current persisted state to latest. Flush ignores the + // finality gate but still prefers the finalized state as the BFS seed when one is + // available — that biases the walk onto the canonical chain. Falls back to the committed + // head (then the longest chain) when no finalized state root is exposed, which also covers + // a persisted-only backlog after the in-memory tier has been drained. while (currentPersistedState.BlockNumber < latestStateId.Value.BlockNumber) { - long nextCompactedBoundary = _schedule.NextFullCompactionAfter(currentPersistedState.BlockNumber); - - // Try finalized snapshots first (compacted, then non-compacted) - Snapshot? snapshotToPersist = GetFinalizedSnapshotAtBlockNumber( - nextCompactedBoundary, - currentPersistedState, - compactedSnapshot: true); - - snapshotToPersist ??= GetFinalizedSnapshotAtBlockNumber( - currentPersistedState.BlockNumber + 1, - currentPersistedState, - compactedSnapshot: false); - - // Fall back to the head's chain if finalized not available - snapshotToPersist ??= GetHeadAncestorAtBlockNumber( - nextCompactedBoundary, - currentPersistedState, - latestStateId.Value, - compactedSnapshot: true); - - snapshotToPersist ??= GetHeadAncestorAtBlockNumber( - currentPersistedState.BlockNumber + 1, - currentPersistedState, - latestStateId.Value, - compactedSnapshot: false); - - if (snapshotToPersist is null) + StateId? seed = null; + long finalizedBlockNumber = finalizedStateProvider.FinalizedBlockNumber; + if (finalizedBlockNumber > currentPersistedState.BlockNumber) { - break; + Hash256? 
finalizedStateRoot = finalizedStateProvider.GetFinalizedStateRootAt(finalizedBlockNumber); + if (finalizedStateRoot is not null) + seed = new StateId(finalizedBlockNumber, finalizedStateRoot); } + // Fall back to the committed head (latestStateId folds in GetLastCommittedStateId, then the + // longest chain) so the forced walk follows the canonical chain rather than a longer + // non-canonical fork, and still covers a persisted-only backlog once the in-memory tier drains. + seed ??= latestStateId; + if (seed is null) break; - using Snapshot _ = snapshotToPersist; + (PersistedSnapshot? persisted, Snapshot? snapshotToPersist) = + snapshotRepository.FindSnapshotToPersist(seed.Value, currentPersistedState, _compactSize); - snapshotRepository.RemoveSiblingAndDescendents(snapshotToPersist.To); + if (persisted is not null) + { + using PersistedSnapshot persistedScope = persisted; + snapshotRepository.RemoveSiblingAndDescendents(persisted.To); + PersistPersistedSnapshot(persisted); + _currentPersistedStateId = persisted.To; + currentPersistedState = _currentPersistedStateId; + snapshotRepository.RemoveStatesUntil(persisted.To.BlockNumber); + continue; + } + + if (snapshotToPersist is null) break; + + using Snapshot inMemScope = snapshotToPersist; + snapshotRepository.RemoveSiblingAndDescendents(snapshotToPersist.To); PersistSnapshot(snapshotToPersist); _currentPersistedStateId = snapshotToPersist.To; currentPersistedState = _currentPersistedStateId; + snapshotRepository.RemoveStatesUntil(snapshotToPersist.To.BlockNumber); } return currentPersistedState; @@ -241,6 +421,12 @@ public void ResetPersistedStateId() _currentPersistedStateId = reader.CurrentState; } + public void Dispose() + { + _cts.Dispose(); + _persistenceLock.Dispose(); + } + internal void PersistSnapshot(Snapshot snapshot) { long compactLength = snapshot.To.BlockNumber! 
- snapshot.From.BlockNumber!; @@ -339,4 +525,58 @@ internal void PersistSnapshot(Snapshot snapshot) Metrics.FlatPersistenceTime.Observe(Stopwatch.GetTimestamp() - sw); } + + internal void PersistPersistedSnapshot(PersistedSnapshot snapshot) + { + long sw = Stopwatch.GetTimestamp(); + + // A linked CompactSized's NodeRefs scatter across the base snapshots' blob arenas, so + // the table scan below reads blobs out of order. Prefetch every base's contiguous RLP + // region up front so the kernel can stream them in as bulk read-ahead; once the + // CompactSized is written the same regions are dropped from the page cache (below) — + // they won't be read again. The leases are held for the whole method. + using PersistedSnapshotList bases = snapshotRepository.LeaseBaseSnapshotsInRange(snapshot.From, snapshot.To); + foreach (PersistedSnapshot baseSnapshot in bases) + baseSnapshot.AdviseWillNeedBlobRange(); + + using WholeReadSession session = snapshot.BeginWholeReadSession(); + WholeReadScanner scanner = PersistedSnapshotScanner.ForWholeRead(session, snapshot); + using (IPersistence.IWriteBatch batch = persistence.CreateWriteBatch(snapshot.From, snapshot.To)) + { + // Self-destruct + account share the account column (0xFE); slots have their own column + // (0xFD). Walk the per-address pass first so every SelfDestruct precedes every SetStorage: + // a self-destruct clears prior storage, and the post-destruct slots are re-applied below. + foreach (WholeReadScanner.PerAddressEntry entry in scanner.PerAddresses) + { + if (entry.SelfDestructFlag is false) + batch.SelfDestruct(entry.Address); + + if (entry.HasAccount) + batch.SetAccount(entry.Address, entry.Account); + } + + // Slots stream sorted by address, so materialize one Address per address-run, not per slot. + Address? 
slotAddress = null; + foreach (WholeReadScanner.SlotEntry slot in scanner.Slots) + { + if (slotAddress is null || !slot.AddressSpan.SequenceEqual(slotAddress.Bytes)) + slotAddress = slot.Address; + batch.SetStorage(slotAddress, slot.Slot, slot.Value); + } + + foreach (WholeReadScanner.StateNodeEntry entry in scanner.StateNodes) + batch.SetStateTrieNode(entry.Path, entry.Rlp); + + foreach (WholeReadScanner.StorageNodeEntry entry in scanner.StorageNodes) + batch.SetStorageTrieNode(entry.AddressHash.ToCommitment(), entry.Path, entry.Rlp); + } + + // The CompactSized is now in RocksDB — drop the prefetched base blob ranges from the + // page cache rather than leaving them hot until the base snapshots are pruned. + foreach (PersistedSnapshot baseSnapshot in bases) + baseSnapshot.AdviseDontNeedBlobRange(); + + Metrics.FlatPersistenceTime.Observe(Stopwatch.GetTimestamp() - sw); + } + } diff --git a/src/Nethermind/Nethermind.State.Flat/ReadOnlySnapshotBundle.cs b/src/Nethermind/Nethermind.State.Flat/ReadOnlySnapshotBundle.cs index 14746a5368e2..3fc7d9e8ef1e 100644 --- a/src/Nethermind/Nethermind.State.Flat/ReadOnlySnapshotBundle.cs +++ b/src/Nethermind/Nethermind.State.Flat/ReadOnlySnapshotBundle.cs @@ -11,6 +11,7 @@ using Nethermind.Core.Utils; using Nethermind.Int256; using Nethermind.State.Flat.Persistence; +using Nethermind.State.Flat.PersistedSnapshots; using Nethermind.Trie; namespace Nethermind.State.Flat; @@ -21,10 +22,15 @@ namespace Nethermind.State.Flat; public sealed class ReadOnlySnapshotBundle( SnapshotPooledList snapshots, IPersistence.IPersistenceReader persistenceReader, - bool recordDetailedMetrics) + bool recordDetailedMetrics, + PersistedSnapshotStack persistedSnapshots) : RefCountingDisposable { - public int SnapshotCount => snapshots.Count; + // Cached once — the persisted-snapshot stack is immutable for the bundle's lifetime. Every read + // gates its persisted-tier probe on this being > 0, so a node with no persisted snapshots (e.g. 
+ // long finality disabled, or none persisted yet) skips the persisted lookups entirely. + private readonly int _persistedSnapshotCount = persistedSnapshots.Count; + public int SnapshotCount => _persistedSnapshotCount + snapshots.Count; private bool _isDisposed; private static readonly StringLabel _readAccountSnapshotLabel = new("account_snapshot"); @@ -54,6 +60,9 @@ public sealed class ReadOnlySnapshotBundle( } } + if (_persistedSnapshotCount > 0 && persistedSnapshots.TryGetAccount(address, out Account? persistedAccount)) + return persistedAccount; + sw = recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; Account? account = persistenceReader.GetAccount(address); if (account == null) @@ -74,12 +83,10 @@ public int DetermineSelfDestructSnapshotIdx(Address address) for (int i = snapshots.Count - 1; i >= 0; i--) { if (snapshots[i].HasSelfDestruct(key)) - { - return i; - } + return _persistedSnapshotCount + i; } - return -1; + return _persistedSnapshotCount > 0 && persistedSnapshots.TryGetSelfDestruct(address, out int snapshotIdx) ? snapshotIdx : -1; } public byte[]? GetSlot(Address address, in UInt256 index, int selfDestructStateIdx) => @@ -89,6 +96,7 @@ public int DetermineSelfDestructSnapshotIdx(Address address) { GuardDispose(); + (Address address, UInt256 index) = key.Key; long sw = recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; for (int i = snapshots.Count - 1; i >= 0; i--) { @@ -99,21 +107,24 @@ public int DetermineSelfDestructSnapshotIdx(Address address) return res; } - if (i <= selfDestructStateIdx) + if (_persistedSnapshotCount + i <= selfDestructStateIdx) { return null; } } + if (_persistedSnapshotCount > 0 && persistedSnapshots.TryGetSlot(address, in index, selfDestructStateIdx, sw, out byte[]? persistedSlot)) + return persistedSlot; + SlotValue outSlotValue = new(); sw = recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; persistenceReader.TryGetSlot(key.Key.Item1, key.Key.Item2, ref outSlotValue); - byte[]? 
value = outSlotValue.ToEvmBytes(); + byte[]? slotResult = outSlotValue.ToEvmBytes(); if (recordDetailedMetrics) { - if (value is null || value.IsZero()) + if (slotResult is null || slotResult.IsZero()) { Metrics.ReadOnlySnapshotBundleTimes.Observe(Stopwatch.GetTimestamp() - sw, _readStoragePersistenceNullLabel); } @@ -123,7 +134,7 @@ public int DetermineSelfDestructSnapshotIdx(Address address) } } - return value; + return slotResult; } public bool TryFindStateNodes(in TreePath path, Hash256 hash, [NotNullWhen(true)] out TrieNode? node) => @@ -176,6 +187,9 @@ public bool TryFindStorageNodes(HashedKey<(Hash256, TreePath)> key, [NotNullWhen { GuardDispose(); + if (_persistedSnapshotCount > 0 && persistedSnapshots.TryLoadStateRlp(in path, out byte[]? persistedRlp)) + return persistedRlp; + Nethermind.Trie.Pruning.Metrics.LoadedFromDbNodesCount++; long sw = recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; byte[]? value = persistenceReader.TryLoadStateRlp(path, flags); @@ -188,6 +202,9 @@ public bool TryFindStorageNodes(HashedKey<(Hash256, TreePath)> key, [NotNullWhen { GuardDispose(); + if (_persistedSnapshotCount > 0 && persistedSnapshots.TryLoadStorageRlp(address, in path, out byte[]? persistedRlp)) + return persistedRlp; + Nethermind.Trie.Pruning.Metrics.LoadedFromDbNodesCount++; long sw = recordDetailedMetrics ? Stopwatch.GetTimestamp() : 0; byte[]? 
value = persistenceReader.TryLoadStorageRlp(address, path, flags); @@ -208,6 +225,7 @@ protected override void CleanUp() if (Interlocked.CompareExchange(ref _isDisposed, true, false)) return; snapshots.Dispose(); + persistedSnapshots.Dispose(); // Null them in case unexpected mutation from trie warmer persistenceReader.Dispose(); diff --git a/src/Nethermind/Nethermind.State.Flat/Snapshot.cs b/src/Nethermind/Nethermind.State.Flat/Snapshot.cs index 7f4a43c7ffe9..a720233a9729 100644 --- a/src/Nethermind/Nethermind.State.Flat/Snapshot.cs +++ b/src/Nethermind/Nethermind.State.Flat/Snapshot.cs @@ -29,7 +29,8 @@ ResourcePool.Usage usage ) : RefCountingDisposable { public long EstimateMemory() => content.EstimateMemory(); - public ResourcePool.Usage Usage => usage; + // Test-only observability (SnapshotCompactorTests); not consumed by production. + internal ResourcePool.Usage Usage => usage; public StateId From => from; public StateId To => to; @@ -86,12 +87,12 @@ public void Reset() } public long EstimateMemory() => - // ConcurrentDictionary entry overhead ~48 bytes for Accounts/Storages/SelfDestruct - Accounts.Count * 172 + // Key (12B: ref 8B + hash 4B) + Value ref (8B) + CD overhead (48) + Account object (~104B) - Storages.Count * 136 + // Key (44B: addr ref 8B + UInt256 32B + hash 4B) + Value (40B SlotValue?) 
+ CD overhead (48) + Value ref (4B) - SelfDestructedStorageAddresses.Count * 64 + // Key (12B: ref 8B + hash 4B) + Value (4B) + CD overhead (48) - StateNodes.Count * (NodeSizeEstimate + 76) + // Key (40B: TreePath 36B + hash 4B) + Value ref (8B) + dictionary overhead (28) + TrieNode - StorageNodes.Count * (NodeSizeEstimate + 84); // Key (48B: Hash256 ref 8B + TreePath 36B + hash 4B) + Value ref (8B) + dictionary overhead (28) + TrieNode + // Cast Count to long before multiplying to avoid int overflow for large snapshots + (long)Accounts.Count * 172 + // Key (12B: ref 8B + hash 4B) + Value ref (8B) + CD overhead (48) + Account object (~104B) + (long)Storages.Count * 136 + // Key (44B: addr ref 8B + UInt256 32B + hash 4B) + Value (40B SlotValue?) + CD overhead (48) + Value ref (4B) + (long)SelfDestructedStorageAddresses.Count * 64 + // Key (12B: ref 8B + hash 4B) + Value (4B) + CD overhead (48) + (long)StateNodes.Count * (NodeSizeEstimate + 76) + // Key (40B: TreePath 36B + hash 4B) + Value ref (8B) + dictionary overhead (28) + TrieNode + (long)StorageNodes.Count * (NodeSizeEstimate + 84); // Key (48B: Hash256 ref 8B + TreePath 36B + hash 4B) + Value ref (8B) + dictionary overhead (28) + TrieNode /// /// Estimates memory for compacted snapshots, counting only dictionary overhead + keys + value-type values. diff --git a/src/Nethermind/Nethermind.State.Flat/SnapshotCompactor.cs b/src/Nethermind/Nethermind.State.Flat/SnapshotCompactor.cs index 78c4e23b7b58..4d3c75d60b9b 100644 --- a/src/Nethermind/Nethermind.State.Flat/SnapshotCompactor.cs +++ b/src/Nethermind/Nethermind.State.Flat/SnapshotCompactor.cs @@ -29,18 +29,17 @@ public class SnapshotCompactor( public bool DoCompactSnapshot(in StateId stateId) { - if (_snapshotRepository.TryLeaseState(stateId, out Snapshot? snapshot)) + if (_snapshotRepository.TryLeaseInMemoryState(stateId, SnapshotTier.InMemoryBase, out Snapshot? 
snapshot)) { - using Snapshot _ = snapshot; // dispose + using Snapshot _ = snapshot; - // Actually do the compaction long sw = Stopwatch.GetTimestamp(); using SnapshotPooledList snapshots = GetSnapshotsToCompact(snapshot); if (snapshots.Count != 0) { Snapshot compactedSnapshot = CompactSnapshotBundle(snapshots); - if (_snapshotRepository.TryAddCompactedSnapshot(compactedSnapshot)) + if (_snapshotRepository.TryAdd(compactedSnapshot, SnapshotTier.InMemoryCompacted)) { Metrics.CompactTime.Observe(Stopwatch.GetTimestamp() - sw); @@ -70,14 +69,14 @@ public SnapshotPooledList GetSnapshotsToCompact(Snapshot snapshot) // Save memory by removing the compacted state from previous compaction foreach (StateId id in _snapshotRepository.GetStatesAtBlockNumber(blockNumber - _compactSize)) { - if (_snapshotRepository.RemoveAndReleaseCompactedKnownState(id)) + if (_snapshotRepository.RemoveAndReleaseInMemoryKnownState(id, SnapshotTier.InMemoryCompacted)) { } } } long startingBlockNumber = blockNumber - compactSize; - SnapshotPooledList snapshots = _snapshotRepository.AssembleSnapshotsUntil(snapshot.To, startingBlockNumber, compactSize); + SnapshotPooledList snapshots = _snapshotRepository.AssembleInMemorySnapshotsForCompaction(snapshot.To, startingBlockNumber, compactSize); bool snapshotsOk = false; try diff --git a/src/Nethermind/Nethermind.State.Flat/SnapshotRepository.cs b/src/Nethermind/Nethermind.State.Flat/SnapshotRepository.cs index 605503add067..97634636415a 100644 --- a/src/Nethermind/Nethermind.State.Flat/SnapshotRepository.cs +++ b/src/Nethermind/Nethermind.State.Flat/SnapshotRepository.cs @@ -9,16 +9,41 @@ using Nethermind.Core.Crypto; using Nethermind.Core.Extensions; using Nethermind.Core.Threading; +using Nethermind.Db; using Nethermind.Logging; +using Nethermind.State.Flat.PersistedSnapshots; +using Nethermind.State.Flat.PersistedSnapshots.Storage; +using Nethermind.State.Flat.Persistence.BloomFilter; namespace Nethermind.State.Flat; -public class 
SnapshotRepository(ILogManager logManager) : ISnapshotRepository +/// +/// Owns both tiers: the in-memory snapshots (base + compacted dictionaries) and the persisted tier +/// (four s over the arena/blob/catalog stores). Two-tier graph +/// walks, persistence, and compaction-assembly live here so they operate on the buckets directly. +/// +public class SnapshotRepository : ISnapshotRepository, IDisposable { - private readonly ILogger _logger = logManager.GetClassLogger(); - + private readonly ILogger _logger; + + // ---- Persisted tier: four buckets keyed by StateId.To. Each bucket is self-contained and + // individually-locked. A `To` can live in more than one bucket (a base and a compacted snapshot + // can share it). + private readonly ISnapshotCatalog _catalog; + private readonly int _compactSize; + private readonly PersistedSnapshotBucket _base; + private readonly PersistedSnapshotBucket _smallCompacted; + private readonly PersistedSnapshotBucket _largeCompacted; + private readonly PersistedSnapshotBucket _compactSized; + private int _disposed; + + // ---- In-memory tier: only the recent unpersisted snapshots (bounded by + // MaxInMemoryBaseSnapshotCount). Aggregates are kept as running totals at the TryAdd* / + // RemoveAndRelease* sites rather than via ConcurrentDictionary.Count. private readonly ConcurrentDictionary _compactedSnapshots = new(); private readonly ConcurrentDictionary _snapshots = new(); + private long _snapshotCount; + private long _compactedSnapshotCount; private readonly ReadWriteLockBox> _sortedSnapshotStateIds = new([]); // StateId is larger than a machine word, so its read/write across threads must be synchronized. 
@@ -26,127 +51,127 @@ public class SnapshotRepository(ILogManager logManager) : ISnapshotRepository private StateId _lastCommittedStateId; private bool _hasLastCommitted; - public int SnapshotCount => _snapshots.Count; - public int CompactedSnapshotCount => _compactedSnapshots.Count; + public SnapshotRepository( + IArenaManager arenaManager, + BlobArenaManager blobArenaManager, + ISnapshotCatalog catalog, + IFlatDbConfig config, + ILogManager logManager) + { + _catalog = catalog; + _logger = logManager.GetClassLogger(); + _base = new PersistedSnapshotBucket(_catalog, SnapshotTier.PersistedBase, _logger); + _smallCompacted = new PersistedSnapshotBucket(_catalog, SnapshotTier.PersistedSmallCompacted, _logger); + _largeCompacted = new PersistedSnapshotBucket(_catalog, SnapshotTier.PersistedLargeCompacted, _logger); + _compactSized = new PersistedSnapshotBucket(_catalog, SnapshotTier.PersistedCompactSized, _logger); + _compactSize = config.CompactSize; + } + + public int SnapshotCount => (int)Interlocked.Read(ref _snapshotCount); + // Test-only; not part of ISnapshotRepository. + internal int CompactedSnapshotCount => (int)Interlocked.Read(ref _compactedSnapshotCount); + + public int PersistedSnapshotCount => (int)(_base.Count + _smallCompacted.Count + _largeCompacted.Count + _compactSized.Count); public void AddStateId(in StateId stateId) { - using ReadWriteLockBox>.Lock _ = _sortedSnapshotStateIds.EnterWriteLock(out SortedSet sortedSnapshots); - sortedSnapshots.Add(stateId); + using (_sortedSnapshotStateIds.EnterWriteLock(out SortedSet sortedSnapshots)) + sortedSnapshots.Add(stateId); } - public SnapshotPooledList AssembleSnapshots(in StateId baseBlock, in StateId targetState, int estimatedSize) - => baseBlock == targetState - ? 
SnapshotPooledList.Empty() - : AssembleSnapshotsBfs(baseBlock, targetState.BlockNumber, targetState, estimatedSize); + public AssembledSnapshotResult AssembleSnapshots(in StateId baseBlock, in StateId targetState, int estimatedSize) + { + if (baseBlock == targetState) return new AssembledSnapshotResult(SnapshotPooledList.Empty(), PersistedSnapshotList.Empty()); - public SnapshotPooledList AssembleSnapshotsUntil(in StateId baseBlock, long minBlockNumber, int estimatedSize) - => AssembleSnapshotsBfs(baseBlock, minBlockNumber, exactTarget: null, estimatedSize); + AssemblePolicy policy = new(targetState); + return WalkAndAssemble(baseBlock, estimatedSize, ref policy); + } /// /// BFS over the snapshot graph from back toward - /// , returning the snapshots along the winning path in ascending - /// order (result[0].From is the terminus, result[^1].To == baseBlock). Returns an - /// empty list when no path reaches the terminus. + /// , returning the in-memory snapshots along the winning path in + /// ascending order (result[0].From is the terminus, result[^1].To == baseBlock). + /// Empty when no path reaches the terminus. /// /// - /// Each StateId node has up to 2 edges, explored widest-jump first - the in-memory compacted - /// snapshot, then the in-memory base snapshot. Edges dropping below - /// are pruned, so a wide compacted jump that overshoots is discarded in favour of the narrower base - /// edge. The path wins at the first node reaching ; when - /// is supplied that node must also equal it (used to assemble a path - /// to a specific state), otherwise any state at that block number qualifies (used to gather a window - /// for compaction). `visited` owns a lease on every leased snapshot; the winning path is re-leased - /// before the finally releases all of them. + /// Each node has up to 2 edges, explored widest-jump first (compacted, then base). Edges dropping + /// below are pruned, so an overshooting compacted jump yields to + /// the narrower base edge. 
Wins at the first node reaching . /// - private SnapshotPooledList AssembleSnapshotsBfs(in StateId baseBlock, long minBlockNumber, StateId? exactTarget, int estimatedSize) + public SnapshotPooledList AssembleInMemorySnapshotsForCompaction(in StateId baseBlock, long minBlockNumber, int estimatedSize) { - using ArrayPoolListRef<(Snapshot Snapshot, int ParentIndex)> visited = new(estimatedSize); - using PooledQueue<(StateId Current, int ParentIndex)> queue = new(); - using PooledSet seen = new(); - try - { - queue.Enqueue((baseBlock, -1)); - seen.Add(baseBlock); - int winnerIndex = -1; - - while (queue.Count > 0 && winnerIndex < 0) - { - (StateId current, int parentIndex) = queue.Dequeue(); - - for (int edge = 0; edge < 2; edge++) - { - Snapshot? snapshot; - if (edge == 0) - { - if (!TryLeaseCompactedState(current, out snapshot)) continue; - } - else - { - if (!TryLeaseState(current, out snapshot)) continue; - } - - StateId from = snapshot.From; - - if (from.BlockNumber < minBlockNumber || !seen.Add(from)) - { - snapshot.Dispose(); - continue; - } - - int index = visited.Count; - visited.Add((snapshot, parentIndex)); - - if (from.BlockNumber == minBlockNumber && (exactTarget is not StateId target || from == target)) - { - winnerIndex = index; - break; - } + InMemoryCompactionPolicy policy = new(minBlockNumber); + AssembledSnapshotResult result = WalkAndAssemble(baseBlock, estimatedSize, ref policy); + result.Persisted.Dispose(); // in-memory-only policy yields no persisted entries + return result.InMemory; + } - queue.Enqueue((from, index)); - } - } + /// + /// Find the next snapshot to flush — the valid persist candidate directly extending + /// (its From equals it). Returns the leased persisted + /// or in-memory snapshot (caller disposes), or (null, null) when none is reachable. Used by + /// both persistence phases in . + /// + /// + /// Runs with , navigating + /// From-edges from down toward + /// and winning at the first candidate edge reaching it. 
Non-candidate tiers are traversed as + /// skip-pointers. The winning candidate is the chain's terminus; this re-leases just that snapshot + /// and drops the rest. + /// + public (PersistedSnapshot? Persisted, Snapshot? InMemory) FindSnapshotToPersist( + in StateId seed, in StateId currentPersistedState, int compactSize) + { + if (seed.BlockNumber <= currentPersistedState.BlockNumber) return (null, null); - if (winnerIndex < 0) return SnapshotPooledList.Empty(); + int estimatedSize = (int)Math.Clamp(seed.BlockNumber - currentPersistedState.BlockNumber, 4, 4096); + FindPersistPolicy policy = new(currentPersistedState, compactSize); + using AssembledSnapshotResult result = WalkAndAssemble(seed, estimatedSize, ref policy); - // Walk winner -> root: yields ascending order directly (result[0].From == terminus, - // result[^1].To == baseBlock). - SnapshotPooledList result = new(estimatedSize); - for (int walk = winnerIndex; walk >= 0; walk = visited[walk].ParentIndex) - { - // `visited` still holds a lease, so re-acquire cannot fail; assert flags future - // Snapshot lifecycle changes that could break this invariant. - bool acquired = visited[walk].Snapshot.TryAcquire(); - Debug.Assert(acquired, "TryAcquire failed despite held lease"); - result.Add(visited[walk].Snapshot); - } - return result; + // Candidate is the chain terminus (oldest); re-lease it and let the `using` drop the rest. The + // in-mem-before-persisted invariant puts a persisted candidate at Persisted[0], in-memory at InMemory[0]. 
+ if (result.Persisted.Count > 0) + { + PersistedSnapshot persisted = result.Persisted[0]; + persisted.TryAcquire(); + return (persisted, null); } - finally + if (result.InMemory.Count > 0) { - for (int i = 0; i < visited.Count; i++) - { - visited[i].Snapshot.Dispose(); - } + Snapshot inMemory = result.InMemory[0]; + inMemory.TryAcquire(); + return (null, inMemory); } + return (null, null); } - public bool TryLeaseCompactedState(in StateId stateId, [NotNullWhen(true)] out Snapshot? entry) + /// + /// Best-effort backward BFS over the persisted tier from , returning the + /// contiguous chain reaching the deepest block >= + /// (oldest-first). Need not be fully populated; empty when fewer than two snapshots are found. + /// + public PersistedSnapshotList AssemblePersistedSnapshotsForCompaction(in StateId toStateId, long minBlockNumber) { - SpinWait sw = new(); - while (_compactedSnapshots.TryGetValue(stateId, out entry)) - { - if (entry.TryAcquire()) return true; + int estimatedSize = (int)Math.Clamp(toStateId.BlockNumber - minBlockNumber, 4, 4096); + PersistedCompactionPolicy policy = new(minBlockNumber); + AssembledSnapshotResult result = WalkAndAssemble(toStateId, estimatedSize, ref policy); + result.InMemory.Dispose(); // persisted-only policy yields no in-memory entries - sw.SpinOnce(); + PersistedSnapshotList persisted = result.Persisted; + if (persisted.Count < 2) + { + persisted.Dispose(); + return PersistedSnapshotList.Empty(); } - return false; + return persisted; } - public bool TryLeaseState(in StateId stateId, [NotNullWhen(true)] out Snapshot? entry) + public bool TryLeaseInMemoryState(in StateId stateId, SnapshotTier tier, [NotNullWhen(true)] out Snapshot? entry) { + tier.EnsureInMemory(); + ConcurrentDictionary snapshots = tier == SnapshotTier.InMemoryBase ? 
_snapshots : _compactedSnapshots; SpinWait sw = new(); - while (_snapshots.TryGetValue(stateId, out entry)) + while (snapshots.TryGetValue(stateId, out entry)) { if (entry.TryAcquire()) return true; @@ -155,31 +180,34 @@ public bool TryLeaseState(in StateId stateId, [NotNullWhen(true)] out Snapshot? return false; } - public bool TryAddCompactedSnapshot(Snapshot snapshot) + public bool TryAdd(Snapshot snapshot, SnapshotTier tier) { - if (_compactedSnapshots.TryAdd(snapshot.To, snapshot)) + tier.EnsureInMemory(); + if (tier == SnapshotTier.InMemoryBase) { - Metrics.CompactedSnapshotCount++; + if (_snapshots.TryAdd(snapshot.To, snapshot)) + { + Interlocked.Increment(ref _snapshotCount); + Metrics.SnapshotCount++; - long compactedBytes = snapshot.Content.EstimateCompactedMemory(); - Metrics.CompactedSnapshotMemory += compactedBytes; - Metrics.TotalSnapshotMemory += compactedBytes; + long totalBytes = snapshot.EstimateMemory(); + Metrics.SnapshotMemory += totalBytes; + Metrics.TotalSnapshotMemory += totalBytes; - return true; - } + return true; + } - return false; - } + return false; + } - public bool TryAddSnapshot(Snapshot snapshot) - { - if (_snapshots.TryAdd(snapshot.To, snapshot)) + if (_compactedSnapshots.TryAdd(snapshot.To, snapshot)) { - Metrics.SnapshotCount++; + Interlocked.Increment(ref _compactedSnapshotCount); + Metrics.CompactedSnapshotCount++; - long totalBytes = snapshot.EstimateMemory(); - Metrics.SnapshotMemory += totalBytes; - Metrics.TotalSnapshotMemory += totalBytes; + long compactedBytes = snapshot.Content.EstimateCompactedMemory(); + Metrics.CompactedSnapshotMemory += compactedBytes; + Metrics.TotalSnapshotMemory += compactedBytes; return true; } @@ -209,10 +237,24 @@ private bool HasForkAt(long blockNumber) public StateId? GetLastSnapshotId() { - using ReadWriteLockBox>.Lock _ = _sortedSnapshotStateIds.EnterReadLock(out SortedSet sortedSnapshots); - return sortedSnapshots.Count == 0 ? null : sortedSnapshots.Max; + StateId? 
max; + using (_sortedSnapshotStateIds.EnterReadLock(out SortedSet sortedSnapshots)) + max = sortedSnapshots.Count == 0 ? null : sortedSnapshots.Max; + + // Persisted tips aren't in `_sortedSnapshotStateIds`, and after a reorg the persisted tier can hold + // an orphan above the in-memory tip — so fold the persisted maxima in for the true cross-tier max + // that callers (flush bound, orphan-walk bound) need. + // Regression: RemoveSiblingAndDescendents_PersistedOrphanAboveInMemoryTip_IsPruned. + max = MaxState(max, _base.Max); + max = MaxState(max, _smallCompacted.Max); + max = MaxState(max, _largeCompacted.Max); + max = MaxState(max, _compactSized.Max); + return max; } + private static StateId? MaxState(StateId? a, StateId? b) => + a is null ? b : b is null ? a : a.Value.CompareTo(b.Value) >= 0 ? a : b; + public void SetLastCommittedStateId(in StateId stateId) { using Lock.Scope _ = _lastCommittedLock.EnterScope(); @@ -226,123 +268,138 @@ public void SetLastCommittedStateId(in StateId stateId) return _hasLastCommitted ? _lastCommittedStateId : null; } - public bool TryFindAncestorStateAtBlock(in StateId head, long blockNumber, out StateId ancestor) + public bool RemoveAndReleaseInMemoryKnownState(in StateId stateId, SnapshotTier tier) { - if (head.BlockNumber < blockNumber) + tier.EnsureInMemory(); + if (tier == SnapshotTier.InMemoryCompacted) { - ancestor = default; - return false; - } - - if (head.BlockNumber == blockNumber) - { - ancestor = head; - return true; - } - - using SnapshotPooledList path = AssembleSnapshotsUntil(head, blockNumber, estimatedSize: 16); // BFS initial capacity hint - if (path.Count == 0) - { - ancestor = default; - return false; - } - - // result[0].From is the terminus: the state at blockNumber on the head's chain. - ancestor = path[0].From; - return true; - } + if (_compactedSnapshots.TryRemove(stateId, out Snapshot? 
existingState)) + { + Interlocked.Decrement(ref _compactedSnapshotCount); + Metrics.CompactedSnapshotCount--; - public bool RemoveAndReleaseCompactedKnownState(in StateId stateId) - { - if (_compactedSnapshots.TryRemove(stateId, out Snapshot? existingState)) - { - Metrics.CompactedSnapshotCount--; + long compactedBytes = existingState.Content.EstimateCompactedMemory(); + Metrics.CompactedSnapshotMemory -= compactedBytes; + Metrics.TotalSnapshotMemory -= compactedBytes; - long compactedBytes = existingState.Content.EstimateCompactedMemory(); - Metrics.CompactedSnapshotMemory -= compactedBytes; - Metrics.TotalSnapshotMemory -= compactedBytes; + existingState.Dispose(); - existingState.Dispose(); + return true; + } - return true; + return false; } - return false; - } - - public void RemoveAndReleaseKnownState(StateId stateId) - { - if (_snapshots.TryRemove(stateId, out Snapshot? existingState)) + if (_snapshots.TryRemove(stateId, out Snapshot? existing)) { + Interlocked.Decrement(ref _snapshotCount); Metrics.SnapshotCount--; using (_sortedSnapshotStateIds.EnterWriteLock(out SortedSet sortedSnapshots)) - { sortedSnapshots.Remove(stateId); - } - long totalBytes = existingState.EstimateMemory(); + long totalBytes = existing.EstimateMemory(); Metrics.SnapshotMemory -= totalBytes; Metrics.TotalSnapshotMemory -= totalBytes; - existingState.Dispose(); // After memory + existing.Dispose(); + + return true; } + + return false; } - public bool HasState(in StateId stateId) => _snapshots.ContainsKey(stateId); + public bool HasState(in StateId stateId) + { + if (_snapshots.ContainsKey(stateId)) return true; + if (HasBaseSnapshot(stateId)) return true; + return false; + } - public ArrayPoolList GetSnapshotBeforeStateId(StateId stateId) + public ArrayPoolList GetStatesUpToBlock(long blockNumber) { - if (stateId.BlockNumber < 0) + if (blockNumber < 0) return ArrayPoolList.Empty(); using ReadWriteLockBox>.Lock _ = _sortedSnapshotStateIds.EnterReadLock(out SortedSet sortedSnapshots); 
return sortedSnapshots - .GetViewBetween(new StateId(0, Hash256.Zero), new StateId(stateId.BlockNumber, Keccak.MaxValue)) + .GetViewBetween(new StateId(0, Hash256.Zero), new StateId(blockNumber, Keccak.MaxValue)) .ToPooledList(0); } - public void RemoveStatesUntil(in StateId currentPersistedStateId) + public void RemoveStatesUntil(long blockNumber) { - using ArrayPoolList statesBeforeStateId = GetSnapshotBeforeStateId(currentPersistedStateId); - foreach (StateId stateToRemove in statesBeforeStateId) + using ArrayPoolList statesUpToBlock = GetStatesUpToBlock(blockNumber); + foreach (StateId stateToRemove in statesUpToBlock) { - RemoveAndReleaseCompactedKnownState(stateToRemove); - RemoveAndReleaseKnownState(stateToRemove); + // A To can live in both in-memory tiers — remove from each. + RemoveAndReleaseInMemoryKnownState(stateToRemove, SnapshotTier.InMemoryCompacted); + RemoveAndReleaseInMemoryKnownState(stateToRemove, SnapshotTier.InMemoryBase); } + + // A persist also supersedes the persisted tier: drop persisted snapshots strictly below the block + // (the base at the persisted block stays as a read/compaction source until the state advances past + // it). Unified here so callers don't pair this with a separate persisted-tier call. + RemovePersistedStatesUntil(blockNumber); } private const int PruneBatchSize = 1000; public void RemoveSiblingAndDescendents(in StateId canonicalStateId) { - // Fast-fail when the persisted block has no sibling state: nothing above it can be orphaned. - if (!HasForkAt(canonicalStateId.BlockNumber)) return; + long canonicalBlock = canonicalStateId.BlockNumber; - StateId? lastStateId = GetLastSnapshotId(); - if (lastStateId is null || lastStateId.Value.BlockNumber <= canonicalStateId.BlockNumber) return; + // Fast-fail when the block has no sibling in either tier: with a single state at the block, + // everything above it chains down through the canonical one, so nothing above can be orphaned. 
+ // A non-canonical sibling may live in-memory or — if converted before the reorg pruned it — in + // the persisted tier. + if (!HasForkAt(canonicalBlock) && !HasPersistedForkAt(canonicalStateId)) return; - long maxBlock = lastStateId.Value.BlockNumber; - long batchStart = canonicalStateId.BlockNumber + 1; - int totalPruned = 0; + // Bound the orphan walk by the highest block in either tier. GetLastSnapshotId folds in the + // persisted tips, covering a persisted orphan above the in-memory tip (DoConvert moves a + // converted range into the persisted tier and drops it from in-memory). + long maxBlock = GetLastSnapshotId()?.BlockNumber ?? long.MinValue; + if (maxBlock <= canonicalBlock) return; - using PooledStack stack = new(); - using PooledSet seen = new(); + long batchStart = canonicalBlock + 1; + int totalPruned = 0; while (batchStart <= maxBlock) { long batchEnd = Math.Min(batchStart + PruneBatchSize - 1, maxBlock); - using ArrayPoolListRef batch = GetStatesInRange(batchStart, batchEnd); - foreach (StateId stateId in batch) + + // In-memory orphans above the persisted block. + using (ArrayPoolListRef inMemory = GetStatesInRange(batchStart, batchEnd)) { - if (!CanReachState(stateId, canonicalStateId, stack, seen)) + foreach (StateId stateId in inMemory) { - RemoveAndReleaseCompactedKnownState(stateId); - RemoveAndReleaseKnownState(stateId); - totalPruned++; + if (!CanReachState(stateId, canonicalStateId)) + { + // A To can live in both in-memory tiers — remove from each. + RemoveAndReleaseInMemoryKnownState(stateId, SnapshotTier.InMemoryCompacted); + RemoveAndReleaseInMemoryKnownState(stateId, SnapshotTier.InMemoryBase); + totalPruned++; + } } } + + // Persisted-tier orphans above the persisted block — e.g. non-canonical siblings converted + // into the tier (DoConvert applies no canonicality filter) before the reorg orphaned them, + // unreachable by the in-memory pass above. 
+ using (ArrayPoolList persisted = GetPersistedStatesInRange(batchStart, batchEnd)) + { + foreach (StateId stateId in persisted) + { + if (!CanReachState(stateId, canonicalStateId) + && RemovePersistedStateExact(stateId)) + { + totalPruned++; + } + } + } + batchStart = batchEnd + 1; } @@ -352,40 +409,62 @@ public void RemoveSiblingAndDescendents(in StateId canonicalStateId) } } - private bool CanReachState(in StateId from, in StateId target, PooledStack stack, PooledSet seen) + /// True when the persisted tier holds a non-canonical state at + /// 's block — a fork the canonical persist orphans. + private bool HasPersistedForkAt(in StateId canonicalStateId) + { + using ArrayPoolList atBlock = + GetPersistedStatesInRange(canonicalStateId.BlockNumber, canonicalStateId.BlockNumber); + foreach (StateId stateId in atBlock) + if (stateId != canonicalStateId) return true; + return false; + } + + /// + /// Walks parent (From) edges from toward + /// across both tiers. Crossing into the persisted tier is required so a canonical in-memory state + /// whose ancestry descends through a converted snapshot is not mistaken for an orphan. + /// + private bool CanReachState(in StateId from, in StateId target) { if (from == target) return true; if (from.BlockNumber <= target.BlockNumber) return false; - stack.Clear(); - seen.Clear(); - stack.Push(from); + // Order-independent reachability, so a stack DFS suffices; each lease is read for its From then + // disposed immediately. Same hardcoded in-mem-cannot-follow-persisted invariant as WalkAndAssemble. + using PooledStack stack = new(); + using PooledSet seen = new(); seen.Add(from); + stack.Push(new WalkNode(from, viaPersisted: false, -1)); + // Expansion order (same as AssemblePolicy): widest skip first for the shortest reachable chain. 
+ ReadOnlySpan edgePriority = + [SnapshotTier.PersistedLargeCompacted, SnapshotTier.PersistedCompactSized, SnapshotTier.InMemoryCompacted, SnapshotTier.InMemoryBase, SnapshotTier.PersistedSmallCompacted, SnapshotTier.PersistedBase]; while (stack.Count > 0) { - StateId current = stack.Pop(); - - for (int edge = 0; edge < 2; edge++) + WalkNode node = stack.Pop(); + foreach (SnapshotTier tier in edgePriority) { - Snapshot? snapshot; - if (edge == 0) + if (node.ViaPersisted && !tier.IsPersisted()) continue; + + IDisposable snapshot; + StateId parentFrom; + if (tier.IsPersisted()) { - if (!TryLeaseCompactedState(current, out snapshot)) continue; + if (!TryLeasePersistedState(node.Current, tier, out PersistedSnapshot? persisted)) continue; + (snapshot, parentFrom) = (persisted, persisted.From); } else { - if (!TryLeaseState(current, out snapshot)) continue; + if (!TryLeaseInMemoryState(node.Current, tier, out Snapshot? inMemory)) continue; + (snapshot, parentFrom) = (inMemory, inMemory.From); } - StateId parent = snapshot.From; snapshot.Dispose(); - if (parent == target) return true; - if (parent.BlockNumber > target.BlockNumber && seen.Add(parent)) - { - stack.Push(parent); - } + if (parentFrom == target) return true; + if (parentFrom.BlockNumber > target.BlockNumber && seen.Add(parentFrom)) + stack.Push(new WalkNode(parentFrom, tier.IsPersisted(), -1)); } } return false; @@ -403,4 +482,425 @@ private ArrayPoolListRef GetStatesInRange(long blockStartInclusive, lon foreach (StateId stateId in view) result.Add(stateId); return result; } + + // ===================== Persisted tier ===================== + + /// + /// Index a caller-built into the bucket for , + /// acquiring the bucket's lease under its lock so a racing prune can't dispose it mid-insert. The + /// caller retains and disposes its construction lease, and owns the catalog entry — a freshly + /// persisted/compacted snapshot writes one; a snapshot reloaded from the catalog does not. 
+ /// + public void AddPersistedSnapshot(PersistedSnapshot snapshot, SnapshotTier tier) + { + if (_logger.IsDebug) _logger.Debug($"Created persisted snapshot {tier} {snapshot.From.BlockNumber}->{snapshot.To.BlockNumber} ({snapshot.Size} bytes)"); + BucketFor(tier).Add(snapshot.To, snapshot); + } + + /// + public bool ReplacePersistedSnapshot(in StateId to, PersistedSnapshot replacement, SnapshotTier tier) => + BucketFor(tier).Replace(to, replacement); + + /// + /// Lease the persisted snapshot ending at from the bucket for + /// (must be a Persisted* value). Caller disposes the lease. + /// + public bool TryLeasePersistedState(in StateId toState, SnapshotTier tier, [NotNullWhen(true)] out PersistedSnapshot? snapshot) => tier switch + { + SnapshotTier.PersistedBase => TryLeaseFrom(_base, toState, out snapshot), + SnapshotTier.PersistedSmallCompacted => TryLeaseFrom(_smallCompacted, toState, out snapshot), + SnapshotTier.PersistedLargeCompacted => TryLeaseFrom(_largeCompacted, toState, out snapshot), + SnapshotTier.PersistedCompactSized => TryLeaseFrom(_compactSized, toState, out snapshot), + _ => throw new ArgumentOutOfRangeException(nameof(tier), tier, "Only persisted tiers are valid here."), + }; + + private static bool TryLeaseFrom(PersistedSnapshotBucket bucket, in StateId toState, [NotNullWhen(true)] out PersistedSnapshot? snapshot) + { + if (bucket.TryGet(toState, out snapshot) && snapshot.TryAcquire()) + return true; + snapshot = null; + return false; + } + + /// The bucket for a persisted tier — a 1:1 map. 
+ private PersistedSnapshotBucket BucketFor(SnapshotTier tier) => tier switch + { + SnapshotTier.PersistedBase => _base, + SnapshotTier.PersistedSmallCompacted => _smallCompacted, + SnapshotTier.PersistedLargeCompacted => _largeCompacted, + SnapshotTier.PersistedCompactSized => _compactSized, + _ => throw new ArgumentOutOfRangeException(nameof(tier), tier, "Only persisted tiers are valid here."), + }; + + /// + /// Lease every base snapshot tiling (from, to], walking From pointers back from + /// . Bulk-prefetches the base blob-RLP regions before a linked CompactSized is + /// scanned. Best-effort — stops at the first gap. Caller disposes the returned list. + /// + public PersistedSnapshotList LeaseBaseSnapshotsInRange(StateId from, StateId to) + { + PersistedSnapshotList result = new(0); + StateId current = to; + while (current != from && current.BlockNumber > from.BlockNumber) + { + if (!_base.TryGet(current, out PersistedSnapshot? snapshot) || !snapshot.TryAcquire()) + break; + result.Add(snapshot); + if (snapshot.From == current) + break; // self-loop guard + current = snapshot.From; + } + return result; + } + + /// + public void ShareBloomAcrossRange(StateId from, StateId to, RefCountedBloomFilter sharedBloom, BlobArenaManager blobs) + { + StateId current = to; + while (current.BlockNumber > from.BlockNumber) + { + // Advance pointer is the base chain only: a compacted snapshot's From can dip below `from`, + // and following it would walk out of the window. A gap in the base chain simply stops the + // walk (the unreached snapshots keep their own bloom — correct, just less memory reclaimed). + if (!_base.TryGet(current, out PersistedSnapshot? baseSnap)) break; + StateId baseParent = baseSnap.From; // From is immutable; safe to read without a lease + + // At this block, every bucket may hold a snapshot ending here; share the contained ones. 
+ ShareBloomAt(current, from, to, sharedBloom, blobs, SnapshotTier.PersistedBase); + ShareBloomAt(current, from, to, sharedBloom, blobs, SnapshotTier.PersistedSmallCompacted); + ShareBloomAt(current, from, to, sharedBloom, blobs, SnapshotTier.PersistedLargeCompacted); + ShareBloomAt(current, from, to, sharedBloom, blobs, SnapshotTier.PersistedCompactSized); + + if (baseParent == current) break; // self-loop guard + current = baseParent; + } + } + + /// + /// Re-register the snapshot ending at in 's bucket as a + /// twin over the same reservation carrying a lease on , so its own + /// bloom is freed once it drains. Skips the snapshot already on the shared bloom and any extending + /// below (whose keys the shared bloom does not cover — sharing it would + /// produce false negatives). + /// + private void ShareBloomAt(in StateId at, in StateId from, in StateId to, + RefCountedBloomFilter sharedBloom, BlobArenaManager blobs, SnapshotTier tier) + { + // Lease before reading the entry's fields so it cannot drain mid-build; the twin takes its own + // reservation + blob leases in its ctor, so it is independent of this probe lease. + if (!TryLeasePersistedState(at, tier, out PersistedSnapshot? s)) return; + using (s) + { + if (ReferenceEquals(s.BloomRef, sharedBloom)) return; // the big snapshot itself / already shared + if (s.From.BlockNumber < from.BlockNumber) return; // extends below window → not a subset + if (s.To.BlockNumber > to.BlockNumber) return; // belt-and-suspenders (true on a backward walk) + sharedBloom.AcquireLease(); + using PersistedSnapshot twin = new(s.From, s.To, s.Reservation, blobs, tier, sharedBloom); + // false on a racing prune → twin's `using` drops the cloned bloom lease, self-healing. + ReplacePersistedSnapshot(at, twin, tier); + } + } + + /// + /// Prune persisted snapshots with To.BlockNumber before the given block. 
Blob arenas referenced by + /// surviving compacted snapshots stay alive via the refcount — no + /// explicit "referenced base id" check is needed here. + /// + public void RemovePersistedStatesUntil(long blockNumber) + { + _base.PruneBefore(blockNumber); + _smallCompacted.PruneBefore(blockNumber); + _largeCompacted.PruneBefore(blockNumber); + _compactSized.PruneBefore(blockNumber); + } + + /// + /// Enumerate persisted To-StateIds across all buckets whose To.BlockNumber is in + /// [startBlockInclusive, endBlockInclusive], deduped. Caller disposes the returned list. + /// + private ArrayPoolList GetPersistedStatesInRange(long startBlockInclusive, long endBlockInclusive) + { + if (endBlockInclusive < startBlockInclusive) return ArrayPoolList.Empty(); + + StateId min = new(startBlockInclusive, ValueKeccak.Zero); + StateId max = new(endBlockInclusive, ValueKeccak.MaxValue); + + // A `To` can live in more than one bucket, so dedupe across the block-ordered sets. + HashSet union = []; + _base.CollectRange(min, max, union); + _smallCompacted.CollectRange(min, max, union); + _largeCompacted.CollectRange(min, max, union); + _compactSized.CollectRange(min, max, union); + + ArrayPoolList result = new(union.Count); + foreach (StateId to in union) result.Add(to); + return result; + } + + /// + /// Remove the persisted snapshot(s) at exactly from every bucket it + /// appears in, releasing their leases. Returns true when anything was removed. + /// + // `|` (not `||`): every bucket must be attempted — a `To` can appear in more than one. 
+ public bool RemovePersistedStateExact(in StateId toState) => + _base.RemoveExact(toState) | _smallCompacted.RemoveExact(toState) | _largeCompacted.RemoveExact(toState) | _compactSized.RemoveExact(toState); + + public bool HasBaseSnapshot(in StateId stateId) => _base.ContainsKey(stateId); + + public IEnumerable PersistedSnapshots + { + get + { + foreach (PersistedSnapshot snap in _base.Snapshots) yield return snap; + foreach (PersistedSnapshot snap in _smallCompacted.Snapshots) yield return snap; + foreach (PersistedSnapshot snap in _largeCompacted.Snapshots) yield return snap; + foreach (PersistedSnapshot snap in _compactSized.Snapshots) yield return snap; + } + } + + public void MarkPersistedTierForShutdown() + { + // Mark every loaded snapshot's files as shutdown-preserved before any teardown. Snapshots + // pruned earlier this session aren't in the buckets, so their files won't get the flag and are + // deleted when the arena/blob managers are disposed. Must complete for every bucket before + // Dispose tears any bucket down — a file shared between a base and a compacted snapshot must be + // flagged before either is disposed. + _base.PersistAllOnShutdown(); + _smallCompacted.PersistAllOnShutdown(); + _largeCompacted.PersistAllOnShutdown(); + _compactSized.PersistAllOnShutdown(); + } + + public void Dispose() + { + if (Interlocked.Exchange(ref _disposed, 1) != 0) return; + + // Dispose snapshots (drops reservation + blob leases) and roll back each bucket's metrics share. + // Files self-clean as their refcount hits zero; the preserve flag from MarkPersistedTierForShutdown + // keeps the on-disk file for opt-in snapshots. 
+ _base.DisposeAndClear(); + _smallCompacted.DisposeAndClear(); + _largeCompacted.DisposeAndClear(); + _compactSized.DisposeAndClear(); + } + + // ---- Backward-walk infrastructure ---- + // Per-edge policies and the shared chain-gathering driver for the Assemble* / CanReach / + // FindSnapshotToPersist walks above; grouped here to keep the public surface uncluttered. Each policy + // inlines its own edge-priority order. The driver hardcodes the invariant that once an edge crosses + // into the persisted tier the in-memory tiers are unreachable. + + private readonly struct WalkNode(in StateId current, bool viaPersisted, int parentIndex) + { + public readonly StateId Current = current; + public readonly bool ViaPersisted = viaPersisted; + public readonly int ParentIndex = parentIndex; + } + + /// Per-edge verdict returned by . + private enum AssembleStep + { + /// Drop this edge — don't traverse it or count it as a winner. + Skip, + /// Follow the edge and keep searching; not a winner. + Traverse, + /// Mark current best winner but keep walking — a deeper edge may still win. The last + /// before the frontier drains is the final winner. + Win, + /// Mark the winner and stop immediately. + WinAndStop, + } + + /// + /// Per-edge policy for : the edge-priority table and a per-edge + /// verdict. The driver owns storage, lease handling, cycle detection, winner + /// tracking, and reconstruction; the policy only inspects each candidate parent edge. + /// + private interface IAssemblePolicy + { + ReadOnlySpan EdgePriority { get; } + /// Verdict for one parent edge: is the node being expanded, + /// is the parent it reaches over . + AssembleStep Decide(in StateId to, in StateId from, SnapshotTier tier); + } + + // Full dual-tier walk for AssembleSnapshots. 
The driver enforces the in-mem-cannot-follow-persisted + // invariant, so this only filters by block: an overshooting persisted snapshot is the terminal + // element, an overshooting in-memory edge is unusable, and reaching the target exactly wins (a + // different state at the target's block is a sibling fork, skipped). + private readonly struct AssemblePolicy(StateId target) : IAssemblePolicy + { + // Expansion order, widest skip first, so a read assembles the shortest chain: large-compacted + // (>CompactSize), CompactSized, in-memory hops, then narrow small-compacted and persisted bases. + public ReadOnlySpan EdgePriority => + [SnapshotTier.PersistedLargeCompacted, SnapshotTier.PersistedCompactSized, SnapshotTier.InMemoryCompacted, SnapshotTier.InMemoryBase, SnapshotTier.PersistedSmallCompacted, SnapshotTier.PersistedBase]; + + public AssembleStep Decide(in StateId to, in StateId from, SnapshotTier tier) + { + if (from.BlockNumber < target.BlockNumber) + return tier.IsPersisted() ? AssembleStep.WinAndStop : AssembleStep.Skip; + if (from == target) return AssembleStep.WinAndStop; + // A different state at the target's block is a sibling fork — don't win there. + return from.BlockNumber == target.BlockNumber ? AssembleStep.Skip : AssembleStep.Traverse; + } + } + + // In-memory-only walk for AssembleInMemorySnapshotsForCompaction: widest-jump first, pruning edges + // below minBlockNumber, winning at the first node reaching it. + private readonly struct InMemoryCompactionPolicy(long minBlockNumber) : IAssemblePolicy + { + public ReadOnlySpan EdgePriority => [SnapshotTier.InMemoryCompacted, SnapshotTier.InMemoryBase]; + + public AssembleStep Decide(in StateId to, in StateId from, SnapshotTier tier) => + from.BlockNumber < minBlockNumber ? AssembleStep.Skip + : from.BlockNumber == minBlockNumber ? 
AssembleStep.WinAndStop + : AssembleStep.Traverse; + } + + // Best-effort persisted-only compaction walk: prunes edges overshooting minBlockNumber and wins on + // the deepest (lowest-block) node reached. Widest-first + BFS gives the widest path to each depth; + // the window need not be fully populated. + private struct PersistedCompactionPolicy(long minBlockNumber) : IAssemblePolicy + { + private long _winnerBlock = long.MaxValue; + + public readonly ReadOnlySpan EdgePriority => + [SnapshotTier.PersistedLargeCompacted, SnapshotTier.PersistedCompactSized, SnapshotTier.PersistedSmallCompacted, SnapshotTier.PersistedBase]; + + public AssembleStep Decide(in StateId to, in StateId from, SnapshotTier tier) + { + if (from.BlockNumber < minBlockNumber) return AssembleStep.Skip; + if (from.BlockNumber == minBlockNumber) return AssembleStep.WinAndStop; // window start, deepest possible + if (from.BlockNumber < _winnerBlock) + { + _winnerBlock = from.BlockNumber; + return AssembleStep.Win; + } + return AssembleStep.Traverse; + } + } + + // FindSnapshotToPersist navigation: walk From-edges down to currentPersistedState, winning at the first + // edge reaching it that spans at most CompactSize. The >CompactSize large-compacted is a navigation-only + // skip-pointer (followed above the target, never won onto it). Dedup runs only on retained edges, so a + // skipped edge can't shadow the real candidate edge to the same target. + private readonly struct FindPersistPolicy(StateId currentPersistedState, int compactSize) : IAssemblePolicy + { + // LargeCompacted (>CompactSize) leads as a navigation-only skip-pointer; the rest are candidates, + // CompactSized (the ==CompactSize boundary unit) first. 
+ public ReadOnlySpan EdgePriority => + [SnapshotTier.PersistedLargeCompacted, SnapshotTier.PersistedCompactSized, SnapshotTier.InMemoryCompacted, SnapshotTier.PersistedSmallCompacted, SnapshotTier.InMemoryBase, SnapshotTier.PersistedBase]; + + public AssembleStep Decide(in StateId to, in StateId from, SnapshotTier tier) + { + if (from == currentPersistedState) + // Any chunk spanning at most CompactSize is persistable; a wider large-compacted is skip-only. + return to.BlockNumber - from.BlockNumber <= compactSize ? AssembleStep.WinAndStop : AssembleStep.Skip; + return from.BlockNumber > currentPersistedState.BlockNumber ? AssembleStep.Traverse : AssembleStep.Skip; + } + } + + /// + /// Backward BFS over parent (From) edges, gathering the winning chain into an + /// (in-memory + persisted lists, oldest-first). Owns the + /// frontier queue, visited buffer, cycle detection, winner tracking, and reconstruction. Hardcodes + /// the invariant that once an edge crosses into the persisted tier the in-memory tiers are + /// unreachable. The supplies the edge-priority table and per-edge verdict. + /// + private AssembledSnapshotResult WalkAndAssemble(in StateId start, int estimatedSize, ref TPolicy policy) + where TPolicy : struct, IAssemblePolicy + { + using PooledQueue queue = new(); + using PooledSet seen = new(); + // visited owns a lease on every retained edge; GatherChain re-leases the winning path before the + // finally releases all of them. + ArrayPoolList<(IDisposable snapshot, int parentIndex)> visited = new(estimatedSize); + try + { + int winnerIndex = -1; + seen.Add(start); + // Root starts in-memory; ViaPersisted flips on as the walk crosses a persisted edge. A + // persisted-only policy simply has no in-memory tiers to expand. 
+ queue.Enqueue(new WalkNode(start, viaPersisted: false, -1)); + + while (queue.Count > 0) + { + WalkNode node = queue.Dequeue(); + + foreach (SnapshotTier tier in policy.EdgePriority) + { + // Invariant: a node reached over a persisted edge chains only to persisted tiers. + if (node.ViaPersisted && !tier.IsPersisted()) continue; + + IDisposable snapshot; + StateId from; + if (tier.IsPersisted()) + { + if (!TryLeasePersistedState(node.Current, tier, out PersistedSnapshot? persisted)) continue; + (snapshot, from) = (persisted, persisted.From); + } + else + { + if (!TryLeaseInMemoryState(node.Current, tier, out Snapshot? inMemory)) continue; + (snapshot, from) = (inMemory, inMemory.From); + } + + AssembleStep step = policy.Decide(node.Current, from, tier); + if (step == AssembleStep.Skip) { snapshot.Dispose(); continue; } + // Cycle detection — dedup AFTER Decide so a skipped (non-candidate) edge doesn't claim + // its target and shadow a later candidate edge to the same node. No-op for policies + // whose verdict is constant per node. + if (!seen.Add(from)) { snapshot.Dispose(); continue; } + + int idx = visited.Count; + visited.Add((snapshot, node.ParentIndex)); + if (step != AssembleStep.Traverse) winnerIndex = idx; // Win or WinAndStop + if (step == AssembleStep.WinAndStop) return GatherChain(visited, winnerIndex, estimatedSize); + + queue.Enqueue(new WalkNode(from, tier.IsPersisted(), idx)); + } + } + + return GatherChain(visited, winnerIndex, estimatedSize); + } + finally + { + for (int i = 0; i < visited.Count; i++) visited[i].snapshot.Dispose(); + visited.Dispose(); + } + } + + /// + /// Reconstruct the winner→root path into oldest-first in-memory + persisted lists, re-leasing each + /// snapshot so it survives the caller's release of the visited buffer. The winner is the terminus + /// (oldest), and the in-mem-before-persisted invariant keeps each tier contiguous, so both lists come + /// out ascending without a reversal. 
Empty lists when no winner was found. + /// + private static AssembledSnapshotResult GatherChain( + ArrayPoolList<(IDisposable snapshot, int parentIndex)> visited, int winnerIndex, int estimatedSize) + { + if (winnerIndex < 0) + return new AssembledSnapshotResult(SnapshotPooledList.Empty(), PersistedSnapshotList.Empty()); + + SnapshotPooledList inMemory = new(estimatedSize); + PersistedSnapshotList persisted = new(estimatedSize); + for (int walk = winnerIndex; walk >= 0; walk = visited[walk].parentIndex) + { + switch (visited[walk].snapshot) + { + case PersistedSnapshot ps: + // visited still holds a lease, so re-acquire cannot fail. + bool pAcquired = ps.TryAcquire(); + Debug.Assert(pAcquired, "TryAcquire failed despite held lease"); + persisted.Add(ps); + break; + case Snapshot s: + bool sAcquired = s.TryAcquire(); + Debug.Assert(sAcquired, "TryAcquire failed despite held lease"); + inMemory.Add(s); + break; + } + } + return new AssembledSnapshotResult(inMemory, persisted); + } } diff --git a/src/Nethermind/Nethermind.State.Flat/SnapshotTier.cs b/src/Nethermind/Nethermind.State.Flat/SnapshotTier.cs new file mode 100644 index 000000000000..da93c8980b7a --- /dev/null +++ b/src/Nethermind/Nethermind.State.Flat/SnapshotTier.cs @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: 2025 Demerzel Solutions Limited +// SPDX-License-Identifier: LGPL-3.0-only + +using Nethermind.Core.Metric; + +namespace Nethermind.State.Flat; + +/// +/// A snapshot's tier in the two-tier snapshot DAG, spanning the in-memory and persisted tiers. +/// Used as the parameter that selects which store a snapshot operation targets, as the parent-edge +/// classification driving the backward graph walk, and as the on-disk catalog discriminator (only +/// the four Persisted* values are ever serialized — in-memory snapshots have no catalog entry). 
+/// +/// +/// The numeric order is NOT a priority order: traversal priority is expressed by explicit arrays in +/// SnapshotRepository, decoupled from these values. The order is chosen only so that +/// tier >= PersistedBase is exactly "is persisted". Values fit in a single byte and are +/// cast to/from at the catalog serialization boundary. +/// +public enum SnapshotTier +{ + /// In-memory base — narrow in-RAM hop, no disk read. + InMemoryBase, + + /// In-memory compacted — widest in-RAM hop, no disk read. + InMemoryCompacted, + + /// Persisted base — sub-CompactSize, narrowest persisted hop. Owns a contiguous blob region. + PersistedBase, + + /// Persisted small compacted — sub-CompactSize intermediate merges. References base blob arenas. + PersistedSmallCompacted, + + /// The CompactSize-wide snapshot written to RocksDB. + PersistedCompactSized, + + /// Persisted large compacted — a >CompactSize merge produced at a large-compaction + /// boundary. The widest persisted skip-pointer. References base blob arenas. + PersistedLargeCompacted, +} + +public static class SnapshotTierExtensions +{ + public static bool IsPersisted(this SnapshotTier tier) => tier >= SnapshotTier.PersistedBase; + + /// The metric "tier" label (base/smallcompacted/compactsized/largecompacted) for a persisted + /// . Throws for in-memory tiers, which have no persisted-snapshot metrics. + public static string MetricTierLabel(this SnapshotTier tier) => tier switch + { + SnapshotTier.PersistedBase => "base", + SnapshotTier.PersistedSmallCompacted => "smallcompacted", + SnapshotTier.PersistedCompactSized => "compactsized", + SnapshotTier.PersistedLargeCompacted => "largecompacted", + _ => throw new ArgumentOutOfRangeException(nameof(tier), tier, "Not a persisted tier."), + }; + + /// Guards the in-memory-only operations: throws when is persisted. 
+ public static void EnsureInMemory(this SnapshotTier tier) + { + if (tier.IsPersisted()) + throw new ArgumentOutOfRangeException(nameof(tier), tier, "Only in-memory tiers are valid here."); + } +} + +/// Metric key for the per-(tier, size) persisted-snapshot gauges. Size is the +/// snapshot's block span (To - From) — i.e. its compact size. +public readonly record struct PersistedSnapshotLabel(string Tier, long Size) : IMetricLabels +{ + public string[] Labels => [Tier, Size.ToString()]; +} + +/// Metric key for the per-compact-size persisted-snapshot compaction histograms. Size +/// is the actual compacted block span rounded up to the next power of two. +public readonly record struct CompactSizeLabel(int Size) : IMetricLabels +{ + public string[] Labels => [$"size{Size}"]; +} diff --git a/src/Nethermind/Nethermind.Trie.Test/TreePathTests.cs b/src/Nethermind/Nethermind.Trie.Test/TreePathTests.cs index 54c04729af59..7224e24e581f 100644 --- a/src/Nethermind/Nethermind.Trie.Test/TreePathTests.cs +++ b/src/Nethermind/Nethermind.Trie.Test/TreePathTests.cs @@ -199,6 +199,20 @@ public void TestScopedAppend() Assert.That(path.Length, Is.EqualTo(0)); } + [TestCase("", "000000")] + [TestCase("01", "100001")] + [TestCase("0001020304", "012345")] + public void TestEncodeWith3Byte(string nibbleHex, string expectedEncodedHex) + { + byte[] nibbles = string.IsNullOrEmpty(nibbleHex) ? 
[] : Bytes.FromHexString(nibbleHex); + TreePath path = TreePath.FromNibble(nibbles); + + Span buffer = stackalloc byte[3]; + path.EncodeWith3Byte(buffer); + + Assert.That(buffer.ToArray().ToHexString(), Is.EqualTo(expectedEncodedHex)); + } + [TestCase("", "0000000000000000")] [TestCase("01", "1000000000000001")] [TestCase("000102030405060708", "0123456780000009")] @@ -215,6 +229,39 @@ public void TestEncodeWith8Byte(string nibbleHex, string expectedEncodedHex) Assert.That(buffer.ToArray().ToHexString(), Is.EqualTo(expectedEncodedHex)); } + [TestCase("")] + [TestCase("01")] + [TestCase("00010203")] + [TestCase("0001020304")] + public void TestRoundtripWith3Byte(string nibbleHex) + { + byte[] nibbles = string.IsNullOrEmpty(nibbleHex) ? [] : Bytes.FromHexString(nibbleHex); + TreePath original = TreePath.FromNibble(nibbles); + + Span buffer = stackalloc byte[3]; + original.EncodeWith3Byte(buffer); + TreePath decoded = TreePath.DecodeWith3Byte(buffer); + + Assert.That(decoded, Is.EqualTo(original)); + } + + [TestCase("")] + [TestCase("01")] + [TestCase("000102030405060708")] + [TestCase("000102030405060708090a0b0c0d0e")] + [TestCase("000102030405")] + public void TestRoundtripWith8Byte(string nibbleHex) + { + byte[] nibbles = string.IsNullOrEmpty(nibbleHex) ? 
[] : Bytes.FromHexString(nibbleHex); + TreePath original = TreePath.FromNibble(nibbles); + + Span buffer = stackalloc byte[8]; + original.EncodeWith8Byte(buffer); + TreePath decoded = TreePath.DecodeWith8Byte(buffer); + + Assert.That(decoded, Is.EqualTo(original)); + } + private static TreePath CreateFullTreePath() { TreePath path = new(); diff --git a/src/Nethermind/Nethermind.Trie/TreePath.cs b/src/Nethermind/Nethermind.Trie/TreePath.cs index c9d2e81693f9..d59e2a9ebd08 100644 --- a/src/Nethermind/Nethermind.Trie/TreePath.cs +++ b/src/Nethermind/Nethermind.Trie/TreePath.cs @@ -408,6 +408,13 @@ public readonly ValueHash256 ToUpperBoundPath() public bool StartsWith(TreePath otherPath) => Truncate(otherPath.Length) == otherPath; + public readonly void EncodeWith3Byte(Span buffer) + { + Path.Bytes[..3].CopyTo(buffer); + byte lengthAsByte = (byte)Length; + buffer[3 - 1] = (byte)((buffer[3 - 1] & 0xf0) | (lengthAsByte & 0x0f)); + } + public readonly void EncodeWith8Byte(Span buffer) { Path.Bytes[..8].CopyTo(buffer); @@ -416,6 +423,24 @@ public readonly void EncodeWith8Byte(Span buffer) // Pack length into lower 4 bits of last byte (upper 4 bits contain path data) buffer[8 - 1] = (byte)((buffer[8 - 1] & 0xf0) | (lengthAsByte & 0x0f)); } + + public static TreePath DecodeWith3Byte(ReadOnlySpan buffer) + { + Span pathBytes = stackalloc byte[32]; + buffer[..3].CopyTo(pathBytes); + int length = pathBytes[2] & 0x0f; + pathBytes[2] = (byte)(pathBytes[2] & 0xf0); + return new TreePath(new ValueHash256(pathBytes), length); + } + + public static TreePath DecodeWith8Byte(ReadOnlySpan buffer) + { + Span pathBytes = stackalloc byte[32]; + buffer[..8].CopyTo(pathBytes); + int length = pathBytes[7] & 0x0f; + pathBytes[7] = (byte)(pathBytes[7] & 0xf0); + return new TreePath(new ValueHash256(pathBytes), length); + } } public static class TreePathExtensions