diff --git a/libs/cluster/Server/ClusterProvider.cs b/libs/cluster/Server/ClusterProvider.cs index f8a1469c938..47e54a4335b 100644 --- a/libs/cluster/Server/ClusterProvider.cs +++ b/libs/cluster/Server/ClusterProvider.cs @@ -74,10 +74,8 @@ public bool AllowDataLoss => serverOptions.AllowDataLoss; /// - public void Recover() - { - replicationManager.Recover(); - } + public ValueTask RecoverAsync() + => replicationManager.RecoverAsync(); /// public bool PreventRoleChange() diff --git a/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs b/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs index db03bcd7348..fb61bf108b7 100644 --- a/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs +++ b/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs @@ -168,7 +168,7 @@ async Task ReplicaSyncAttachTaskAsync(bool downgradeLock, bool forceAsyn cEntry = GetLatestCheckpointEntryFromDisk(); logger?.LogCheckpointEntry(LogLevel.Information, nameof(ReplicaSyncAttachTaskAsync), cEntry); - storeWrapper.RecoverAOF(); + await storeWrapper.RecoverAOFAsync().ConfigureAwait(false); logger?.LogInformation("InitiateReplicaSync: AOF BeginAddress:{beginAddress} AOF TailAddress:{tailAddress}", storeWrapper.appendOnlyFile.Log.BeginAddress, storeWrapper.appendOnlyFile.Log.TailAddress); var beginAddress = storeWrapper.appendOnlyFile.Log.BeginAddress; @@ -301,10 +301,12 @@ public AofAddress TryReplicaDiskbasedRecovery( remoteCheckpoint.metadata.storeIndexToken, remoteCheckpoint.metadata.storeHlogToken); - storeWrapper.RecoverCheckpoint( +#pragma warning disable VSTHRD002 // The replica-recovery RESP path is synchronous and must complete before sending a response. 
+ storeWrapper.RecoverCheckpointAsync( replicaRecover: true, recoverStoreFromToken, - remoteCheckpoint.metadata); + remoteCheckpoint.metadata).AsTask().GetAwaiter().GetResult(); +#pragma warning restore VSTHRD002 if (replayAOFMap > 0) { diff --git a/libs/cluster/Server/Replication/ReplicationManager.cs b/libs/cluster/Server/Replication/ReplicationManager.cs index d42384fcbad..f5dbca52588 100644 --- a/libs/cluster/Server/Replication/ReplicationManager.cs +++ b/libs/cluster/Server/Replication/ReplicationManager.cs @@ -509,20 +509,20 @@ public void Dispose() /// /// Main recover method for replication /// - public void Recover() + public async ValueTask RecoverAsync() { var nodeRole = clusterProvider.clusterManager.CurrentConfig.LocalNodeRole; switch (nodeRole) { case NodeRole.PRIMARY: - RecoverCheckpointAndAOF(); + await RecoverCheckpointAndAOFAsync().ConfigureAwait(false); break; case NodeRole.REPLICA: // If configured, load from disk - otherwise wait to connect with a Primary if (clusterProvider.serverOptions.ClusterReplicaResumeWithData) { - RecoverCheckpointAndAOF(); + await RecoverCheckpointAndAOFAsync().ConfigureAwait(false); } break; @@ -535,10 +535,10 @@ public void Recover() /// /// Recover whatever is available from . 
/// - private void RecoverCheckpointAndAOF() + private async ValueTask RecoverCheckpointAndAOFAsync() { - storeWrapper.RecoverCheckpoint(); - storeWrapper.RecoverAOF(); + await storeWrapper.RecoverCheckpointAsync().ConfigureAwait(false); + await storeWrapper.RecoverAOFAsync().ConfigureAwait(false); if (clusterProvider.serverOptions.EnableAOF) { // If recovered checkpoint corresponds to an unavailable AOF address, we initialize AOF to that address @@ -555,7 +555,7 @@ private void RecoverCheckpointAndAOF() // First recover and then load latest checkpoint info in-memory if (!InitializeCheckpointStore()) - logger?.LogWarning("Failed acquiring latest memory checkpoint metadata at {method}", nameof(RecoverCheckpointAndAOF)); + logger?.LogWarning("Failed acquiring latest memory checkpoint metadata at {method}", nameof(RecoverCheckpointAndAOFAsync)); } /// diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 7c96e707e89..eae208466d9 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -485,7 +485,9 @@ private GarnetAppendOnlyFile CreateAOF(int dbId) /// public void Start() { - Provider.Recover(); +#pragma warning disable VSTHRD002 // Server startup is synchronous and must complete recovery before accepting connections. + Provider.RecoverAsync().AsTask().GetAwaiter().GetResult(); +#pragma warning restore VSTHRD002 for (var i = 0; i < servers.Length; i++) servers[i].Start(); Provider.Start(); diff --git a/libs/server/AOF/GarnetLog.cs b/libs/server/AOF/GarnetLog.cs index cd700ee16c5..76517f2ca1e 100644 --- a/libs/server/AOF/GarnetLog.cs +++ b/libs/server/AOF/GarnetLog.cs @@ -171,13 +171,8 @@ public AofAddress MemorySizeBytes } } - public void Recover() - { - if (singleLog != null) - singleLog.Recover(); - else - shardedLog.Recover(); - } + public ValueTask RecoverAsync() + => singleLog != null ? 
singleLog.RecoverAsync() : shardedLog.RecoverAsync(); public bool RecoverLatestSequenceNumber(out long recoverUntilSequenceNumber) { diff --git a/libs/server/AOF/ShardedLog.cs b/libs/server/AOF/ShardedLog.cs index 61822105b9a..bf496070563 100644 --- a/libs/server/AOF/ShardedLog.cs +++ b/libs/server/AOF/ShardedLog.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.Linq; using System.Threading; +using System.Threading.Tasks; using Garnet.common; using Microsoft.Extensions.Logging; using Tsavorite.core; @@ -166,10 +167,10 @@ public AofAddress MemorySizeBytes } } - public void Recover() + public async ValueTask RecoverAsync() { foreach (var log in sublog) - log.Recover(); + await log.RecoverAsync().ConfigureAwait(false); } public void Reset() diff --git a/libs/server/AOF/SingleLog.cs b/libs/server/AOF/SingleLog.cs index bf3306522f4..5221916bdaf 100644 --- a/libs/server/AOF/SingleLog.cs +++ b/libs/server/AOF/SingleLog.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
+using System.Threading.Tasks; using Microsoft.Extensions.Logging; using Tsavorite.core; @@ -39,7 +40,7 @@ public class SingleLog(TsavoriteLogSettings logSettings, ILogger logger = null) public AofAddress MemorySizeBytes => AofAddress.Create(1, value: log.MemorySizeBytes); - public void Recover() => log.Recover(); + public ValueTask RecoverAsync() => log.RecoverAsync(); public void Reset() => log.Reset(); public void Dispose() diff --git a/libs/server/Cluster/IClusterProvider.cs b/libs/server/Cluster/IClusterProvider.cs index 975adb210cb..79523e2eafa 100644 --- a/libs/server/Cluster/IClusterProvider.cs +++ b/libs/server/Cluster/IClusterProvider.cs @@ -101,7 +101,7 @@ public interface IClusterProvider : IDisposable /// /// Recover the cluster /// - void Recover(); + ValueTask RecoverAsync(); /// /// Reset gossip stats diff --git a/libs/server/Databases/DatabaseManagerBase.cs b/libs/server/Databases/DatabaseManagerBase.cs index 80384423290..24c2887cbba 100644 --- a/libs/server/Databases/DatabaseManagerBase.cs +++ b/libs/server/Databases/DatabaseManagerBase.cs @@ -35,7 +35,7 @@ internal abstract class DatabaseManagerBase : IDatabaseManager public abstract void ResumeCheckpoints(int dbId); /// - public abstract void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null); + public abstract ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null); /// public abstract Task TakeCheckpointAsync(bool background, int dbId = -1, CancellationToken token = default, ILogger logger = null); @@ -57,7 +57,7 @@ public abstract Task TaskCheckpointBasedOnAofSizeLimitAsync(long aofSizeLimit, public abstract Task WaitForCommitToAofAsync(CancellationToken token = default, ILogger logger = null); /// - public abstract void RecoverAOF(); + public abstract ValueTask RecoverAOFAsync(); /// public abstract AofAddress ReplayAOF(AofAddress untilAddress); @@ -164,18 
+164,15 @@ protected DatabaseManagerBase(StoreWrapper.DatabaseCreatorDelegate createDatabas /// Recover single database from checkpoint /// /// Database to recover - /// Store version - protected void RecoverDatabaseCheckpoint(GarnetDatabase db, out long storeVersion) + protected async ValueTask RecoverDatabaseCheckpointAsync(GarnetDatabase db) { - storeVersion = 0; - - storeVersion = db.Store.Recover(); + var storeVersion = await db.Store.RecoverAsync().ConfigureAwait(false); Logger?.LogInformation("Recovered store to version {storeVersion}", storeVersion); if (storeVersion > 0) - { db.LastSaveTime = DateTimeOffset.UtcNow; - } + + return storeVersion; } /// @@ -227,11 +224,11 @@ protected static void ResumeCheckpoints(GarnetDatabase db) /// Recover a single database from AOF /// /// Database to recover - protected void RecoverDatabaseAOF(GarnetDatabase db) + protected async ValueTask RecoverDatabaseAOFAsync(GarnetDatabase db) { if (db.AppendOnlyFile == null) return; - db.AppendOnlyFile.Log.Recover(); + await db.AppendOnlyFile.Log.RecoverAsync().ConfigureAwait(false); Logger?.LogInformation("Recovered AOF: begin address = {beginAddress}, tail address = {tailAddress}, DB ID: {id}", db.AppendOnlyFile.Log.BeginAddress, db.AppendOnlyFile.Log.TailAddress, db.Id); } diff --git a/libs/server/Databases/IDatabaseManager.cs b/libs/server/Databases/IDatabaseManager.cs index e59176dba23..bf53e679797 100644 --- a/libs/server/Databases/IDatabaseManager.cs +++ b/libs/server/Databases/IDatabaseManager.cs @@ -82,7 +82,7 @@ public interface IDatabaseManager : IDisposable /// /// /// - public void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null); + public ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null); /// /// Take checkpoint of all active databases (or a specified database) if checkpointing is not in progress @@ -140,7 +140,7 @@ public 
Task TaskCheckpointBasedOnAofSizeLimitAsync(long aofSizeLimit, Cancellati /// /// Recover AOF /// - public void RecoverAOF(); + public ValueTask RecoverAOFAsync(); /// /// When replaying AOF we do not want to write AOF records again. diff --git a/libs/server/Databases/MultiDatabaseManager.cs b/libs/server/Databases/MultiDatabaseManager.cs index 1dfbd7e4324..c8b6a3c869f 100644 --- a/libs/server/Databases/MultiDatabaseManager.cs +++ b/libs/server/Databases/MultiDatabaseManager.cs @@ -81,11 +81,11 @@ public MultiDatabaseManager(SingleDatabaseManager src) : } /// - public override void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null) + public override async ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null) { if (replicaRecover) throw new GarnetException( - $"Unexpected call to {nameof(MultiDatabaseManager)}.{nameof(RecoverCheckpoint)} with {nameof(replicaRecover)} == true."); + $"Unexpected call to {nameof(MultiDatabaseManager)}.{nameof(RecoverCheckpointAsync)} with {nameof(replicaRecover)} == true."); var checkpointParentDir = StoreWrapper.serverOptions.StoreCheckpointBaseDirectory; var checkpointDirBaseName = GarnetServerOptions.GetCheckpointDirectoryName(0); @@ -116,7 +116,7 @@ public override void RecoverCheckpoint(bool replicaRecover = false, bool recover try { - RecoverDatabaseCheckpoint(db, out storeVersion); + storeVersion = await RecoverDatabaseCheckpointAsync(db).ConfigureAwait(false); } catch (TsavoriteNoHybridLogException ex) { @@ -416,7 +416,7 @@ public override async Task WaitForCommitToAofAsync(CancellationToken token = def } /// - public override void RecoverAOF() + public override async ValueTask RecoverAOFAsync() { var aofParentDir = StoreWrapper.serverOptions.AppendOnlyFileBaseDirectory; var aofDirBaseName = GarnetServerOptions.GetAppendOnlyFileDirectoryName(0); @@ -442,7 +442,7 @@ public override void 
RecoverAOF() if (!success) throw new GarnetException($"Failed to retrieve or create database for AOF recovery (DB ID = {dbId})."); - RecoverDatabaseAOF(db); + await RecoverDatabaseAOFAsync(db).ConfigureAwait(false); } } diff --git a/libs/server/Databases/SingleDatabaseManager.cs b/libs/server/Databases/SingleDatabaseManager.cs index 28508436cfa..ce9e6ab8197 100644 --- a/libs/server/Databases/SingleDatabaseManager.cs +++ b/libs/server/Databases/SingleDatabaseManager.cs @@ -54,7 +54,7 @@ public override GarnetDatabase TryGetOrAddDatabase(int dbId, out bool success, o } /// - public override void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null) + public override async ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null) { long storeVersion = 0; try @@ -64,7 +64,9 @@ public override void RecoverCheckpoint(bool replicaRecover = false, bool recover // Note: Since replicaRecover only pertains to cluster-mode, we can use the default store pointers (since multi-db mode is disabled in cluster-mode) if (metadata!.storeIndexToken != default && metadata.storeHlogToken != default) { - storeVersion = !recoverFromToken ? Store.Recover() : Store.Recover(metadata.storeIndexToken, metadata.storeHlogToken); + storeVersion = !recoverFromToken + ? 
await Store.RecoverAsync().ConfigureAwait(false) + : await Store.RecoverAsync(metadata.storeIndexToken, metadata.storeHlogToken).ConfigureAwait(false); } if (storeVersion > 0) @@ -72,7 +74,7 @@ public override void RecoverCheckpoint(bool replicaRecover = false, bool recover } else { - RecoverDatabaseCheckpoint(defaultDatabase, out storeVersion); + storeVersion = await RecoverDatabaseCheckpointAsync(defaultDatabase).ConfigureAwait(false); } } catch (TsavoriteNoHybridLogException ex) @@ -239,7 +241,7 @@ public override async Task WaitForCommitToAofAsync(CancellationToken token = def } /// - public override void RecoverAOF() => RecoverDatabaseAOF(defaultDatabase); + public override ValueTask RecoverAOFAsync() => RecoverDatabaseAOFAsync(defaultDatabase); /// public override AofAddress ReplayAOF(AofAddress untilAddress) diff --git a/libs/server/Providers/GarnetProvider.cs b/libs/server/Providers/GarnetProvider.cs index c27819f53f6..3c766d9ebef 100644 --- a/libs/server/Providers/GarnetProvider.cs +++ b/libs/server/Providers/GarnetProvider.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. 
using System.Threading; +using System.Threading.Tasks; using Garnet.common; using Garnet.networking; using Tsavorite.core; @@ -43,8 +44,8 @@ public void Start() /// /// Recover /// - public void Recover() - => storeWrapper.Recover(); + public ValueTask RecoverAsync() + => storeWrapper.RecoverAsync(); /// /// Dispose diff --git a/libs/server/StoreWrapper.cs b/libs/server/StoreWrapper.cs index cbc44a024c0..dfc81e3b438 100644 --- a/libs/server/StoreWrapper.cs +++ b/libs/server/StoreWrapper.cs @@ -360,19 +360,19 @@ public IPEndPoint GetClusterEndpoint() return localEndPoint; } - internal void Recover() + internal async ValueTask RecoverAsync() { if (serverOptions.EnableCluster) { if (serverOptions.Recover) - clusterProvider.Recover(); + await clusterProvider.RecoverAsync().ConfigureAwait(false); } else { if (serverOptions.Recover) { - RecoverCheckpoint(); - RecoverAOF(); + await RecoverCheckpointAsync().ConfigureAwait(false); + await RecoverAOFAsync().ConfigureAwait(false); ReplayAOF(AofAddress.Create(length: serverOptions.AofPhysicalSublogCount, value: -1)); } } @@ -413,10 +413,10 @@ public async Task TakeOnDemandCheckpointAsync(DateTimeOffset entryTime, int dbId /// /// Recover checkpoint /// - public void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null) + public async ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null) { StartSizeTrackers(); // We need to start this before recovery to have size tracking during the recovery process. 
- databaseManager.RecoverCheckpoint(replicaRecover, recoverFromToken, metadata); + await databaseManager.RecoverCheckpointAsync(replicaRecover, recoverFromToken, metadata).ConfigureAwait(false); } /// @@ -447,7 +447,7 @@ public void ResumeCheckpoints(int dbId = 0) /// /// Recover AOF /// - public void RecoverAOF() => databaseManager.RecoverAOF(); + public ValueTask RecoverAOFAsync() => databaseManager.RecoverAOFAsync(); /// /// When replaying AOF we do not want to write AOF records again. diff --git a/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs b/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs index fe040a586e0..fe605e77dcd 100644 --- a/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs +++ b/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs @@ -386,7 +386,7 @@ internal bool MaybeRecoverStore(TsavoriteKV store) try { var sw = Stopwatch.StartNew(); - store.Recover(); + store.RecoverAsync().AsTask().GetAwaiter().GetResult(); sw.Stop(); Console.WriteLine($" Completed recovery in {(double)sw.ElapsedMilliseconds / 1000:N3} seconds"); return true; diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs index c3cf6d505a0..51ca3b76565 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs @@ -33,6 +33,10 @@ public abstract class AllocatorBase internal virtual ObjectLogFilePositionInfo GetObjectLogTail() => new(); // This marks it as "unset" /// Set the ObjectLog tail position, if this is ObjectAllocator. internal virtual void SetObjectLogTail(ObjectLogFilePositionInfo tail) { } + /// Calculate the total serialized object size on a loaded page. Only implemented by ObjectAllocator. 
+ internal virtual long CalculatePageObjectSizes(long page, long startAddress, long untilAddress) => 0; + /// Load objects for records on an already-loaded page for recovery pass 2. + internal virtual void LoadObjectsForRecoveryPass2(long page, long fromAddress, long untilAddress, IDevice objectLogDevice) { } } /// @@ -546,8 +550,8 @@ private protected AllocatorBase(AllocatorSettings allocatorSettings, TStoreFunct throw new TsavoriteException($"{nameof(logSettings.SegmentSizeBits)} must be between {LogSettings.kMinMainLogSegmentSizeBits} and {LogSettings.kMaxSegmentSizeBits}"); if (logSettings.MemorySize != 0 && (logSettings.MemorySize < 1L << LogSettings.kMinMemorySizeBits || logSettings.MemorySize > 1L << LogSettings.kMaxMemorySizeBits)) throw new TsavoriteException($"{nameof(logSettings.MemorySize)} must be between {1L << LogSettings.kMinMemorySizeBits} and {1L << LogSettings.kMaxMemorySizeBits}, or may be 0 for ReadOnly TsavoriteLog"); - if ((logSettings.MemorySize != 0) && (logSettings.MemorySize < (1L << logSettings.PageSizeBits) * 2)) - throw new TsavoriteException($"{nameof(logSettings.MemorySize)} must be at least twice the page size ({1L << logSettings.PageSizeBits})"); + if ((logSettings.MemorySize != 0) && (logSettings.MemorySize < (1L << logSettings.PageSizeBits) * LogSettings.kMinPageCount)) + throw new TsavoriteException($"{nameof(logSettings.MemorySize)} must be at least {LogSettings.kMinPageCount}x the page size ({1L << logSettings.PageSizeBits})"); if (logSettings.MutableFraction < 0.0 || logSettings.MutableFraction > 1.0) throw new TsavoriteException($"{nameof(logSettings.MutableFraction)} must be >= 0.0 and <= 1.0"); if (logSettings.ReadCacheSettings is not null) @@ -926,13 +930,7 @@ void AllocatePagesWithException(int pageIndex, PageOffset localTailPageOffset, i { try { - // Allocate this page, if needed - if (!IsAllocated(pageIndex % BufferSize)) - _wrapper.AllocatePage(pageIndex % BufferSize); - - // Allocate next page in advance, if needed - 
if (!IsAllocated((pageIndex + 1) % BufferSize)) - _wrapper.AllocatePage((pageIndex + 1) % BufferSize); + AllocateCurrentAndNextPage(pageIndex); } catch { @@ -943,6 +941,25 @@ void AllocatePagesWithException(int pageIndex, PageOffset localTailPageOffset, i } } + /// + /// Allocate the page containing and, as the allocator's allocate-ahead invariant, the page following it, each only if it is + /// not already allocated. + /// + /// The page number whose page (and the next page) should be allocated. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + void AllocateCurrentAndNextPage(long page) + { + // Allocate the current page, if needed. + var pageIndex = (int)(page % BufferSize); + if (!IsAllocated(pageIndex)) + _wrapper.AllocatePage(pageIndex); + + // Allocate the next page in advance (an invariant in the allocator), if needed. + var nextPageIndex = (pageIndex + 1) % BufferSize; + if (!IsAllocated(nextPageIndex)) + _wrapper.AllocatePage(nextPageIndex); + } + /// /// Shift log read-only address, with an optional wait /// @@ -1027,7 +1044,7 @@ bool NeedToShiftAddress(long pageIndex, PageOffset localTailPageOffset, int numS // First check whether we need to shift HeadAddress. If we have a logSizeTracker that's over budget then we have already issued // a shift if needed (and allowed by allocated page count); otherwise make sure we stay in the MaxAllocatedPageCount (which may be less than BufferSize). var desiredHeadAddress = HeadAddress; - if (logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit) + if (logSizeTracker is null || !logSizeTracker.IsOverBudget) { var headPage = GetPage(desiredHeadAddress); if (pageIndex - headPage >= MaxAllocatedPageCount) @@ -1062,7 +1079,7 @@ void IssueShiftAddress(long pageIndex, bool needSHA) // First check whether we need to shift HeadAddress. 
If we are not forcing for flush and have a logSizeTracker that's over budget then we have already issued // a shift if needed (and allowed by allocated page count); otherwise make sure we stay in the MaxAllocatedPageCount (which may be less than BufferSize). var desiredHeadAddress = HeadAddress; - if (needSHA || logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit) + if (needSHA || logSizeTracker is null || !logSizeTracker.IsOverBudget) { var headPage = GetPage(desiredHeadAddress); if (pageIndex - headPage >= MaxAllocatedPageCount) @@ -1396,6 +1413,13 @@ public void ShiftBeginAddress(long newBeginAddress, bool truncateLog, bool noFlu } } + /// Find the head address cutoff on a page for partial object loading. Only implemented by ObjectAllocator. + internal virtual long FindHeadAddressCutoffOnPage(long page, long untilAddress, long totalPageObjectSize, int numPagesBelowCurrentPage, long remainingBudget, out int numPagesBelowToEvict) + { + numPagesBelowToEvict = 0; + return GetFirstValidLogicalAddressOnPage(page); + } + /// Invokes eviction observer if set and then frees the page. internal void EvictPageForRecovery(long page) { @@ -1405,10 +1429,11 @@ internal void EvictPageForRecovery(long page) var source = IsReadCache ? EvictionSource.ReadCache : EvictionSource.MainLog; // Per-record eviction walk handles internal heap accounting (key + value via - // logSizeTracker) and optionally notifies the application via OnEvict. + // logSizeTracker) and optionally notifies the application via OnEvict. isRecovery: true so that pages whose + // object load was deferred (empty ObjectIdMap, un-deserialized object/overflow slots) are skipped. if (logSizeTracker is not null || storeFunctions.CallOnEvict) { - _wrapper.EvictRecordsInRange(start, end, source); + _wrapper.EvictRecordsInRange(start, end, source, isRecovery: true); } if (onEvictionObserver is not null) { @@ -1496,7 +1521,7 @@ private void OnPagesClosedWorker() // via OnEvict for app-level cleanup. 
var evictSource = IsReadCache ? EvictionSource.ReadCache : EvictionSource.MainLog; if (logSizeTracker is not null || storeFunctions.CallOnEvict) - _wrapper.EvictRecordsInRange(start, end, evictSource); + _wrapper.EvictRecordsInRange(start, end, evictSource, isRecovery: false); // If we are using a null storage device, we must also shift BeginAddress (leave it in-memory) if (IsNullDevice) @@ -1632,15 +1657,8 @@ protected internal virtual void RecoveryReset(long tailAddress, long headAddress if (pageHeaderSize > 0 && TailPageOffset.Offset == 0) TailPageOffset.Offset = pageHeaderSize; - // Allocate current page if necessary - var pageIndex = TailPageOffset.Page % BufferSize; - if (!IsAllocated(pageIndex)) - _wrapper.AllocatePage(pageIndex); - - // Allocate next page as well - this is an invariant in the allocator! - var nextPageIndex = (pageIndex + 1) % BufferSize; - if (!IsAllocated(nextPageIndex)) - _wrapper.AllocatePage(nextPageIndex); + // Allocate the current page and the next page (the allocate-ahead invariant) if necessary. + AllocateCurrentAndNextPage(TailPageOffset.Page); BeginAddress = beginAddress; HeadAddress = headAddress; @@ -1651,7 +1669,7 @@ protected internal virtual void RecoveryReset(long tailAddress, long headAddress SafeReadOnlyAddress = readonlyAddress; // for the last page which contains tailoffset, it must be open - pageIndex = GetPageIndexForAddress(tailAddress); + var pageIndex = GetPageIndexForAddress(tailAddress); // clear the last page starting from tail address ClearPage(pageIndex, (int)GetOffsetOnPage(tailAddress)); @@ -1718,14 +1736,14 @@ private SectorAlignedMemory GetAndPopulateReadBuffer(long fromLogicalAddress, in /// Read pages from specified device(s) for recovery, with no output of the countdown event (but it is still created in the /// and thus must be Dispose()d). 
- public void AsyncReadPagesForRecovery(long readPageStart, int numPages, long untilAddress, TContext context, - long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null) - => AsyncReadPagesForRecovery(readPageStart, numPages, untilAddress, context, out _, devicePageOffset, logDevice, objectLogDevice); + internal void AsyncReadPagesForRecovery(long readPageStart, int numPages, long untilAddress, TContext context, + long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null, RecoveryPhase recoveryPhase = RecoveryPhase.Pass1) + => AsyncReadPagesForRecovery(readPageStart, numPages, untilAddress, context, out _, devicePageOffset, logDevice, objectLogDevice, recoveryPhase); /// Read pages from specified device for recovery, returning the countdown event [MethodImpl(MethodImplOptions.NoInlining)] private void AsyncReadPagesForRecovery(long readPageStart, int numPages, long untilAddress, TContext context, - out CountdownEvent completed, long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null) + out CountdownEvent completed, long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null, RecoveryPhase recoveryPhase = RecoveryPhase.Pass1) { var usedDevice = logDevice ?? this.device; @@ -1745,7 +1763,7 @@ private void AsyncReadPagesForRecovery(long readPageStart, int numPage context = context, handle = completed, maxAddressOffsetOnPage = PageSize, - isForRecovery = true + recoveryPhase = recoveryPhase }; var offsetInFile = (ulong)(AlignedPageSizeBytes * readPage); @@ -1764,9 +1782,12 @@ private void AsyncReadPagesForRecovery(long readPageStart, int numPage if (logDevice != null) offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset)); - // Create separate readBuffers for each main-log page, as each page launches its own async read and callbacks are on different threads. 
- // Do *not* use "using" here as we need it to survive to the ReadAsync AsyncReadPagesForRecoveryCallback. - asyncResult.readBuffers = CreateCircularReadBuffers(objectLogDevice, logger); + if (recoveryPhase == RecoveryPhase.Pass2) + { + // Create separate readBuffers for each main-log page, as each page launches its own async read and callbacks are on different threads. + // Do *not* use "using" here as we need it to survive to the ReadAsync AsyncReadPagesForRecoveryCallback. + asyncResult.readBuffers = CreateCircularReadBuffers(objectLogDevice, logger); + } // Call the overridden ReadAsync for the derived allocator class ReadAsync(offsetInFile, (IntPtr)pagePointers[pageIndex], readLength, AsyncReadPagesForRecoveryCallback, asyncResult, usedDevice); @@ -1881,11 +1902,23 @@ private protected bool PrepareFlushAsyncResult(long fromAddress, long untilAddre } /// - /// Flush pages asynchronously for recovery (such as when we have invalidated v+1 records). + /// Flush pages asynchronously for recovery (such as when we have invalidated v+1 records, or when replaying snapshot pages into the main log). /// - public void AsyncFlushPagesForRecovery(long scanFromAddress, long flushPageStart, int numPages, DeviceIOCompletionCallback callback, TContext context) + /// The lowest address being flushed on + /// First page to flush + /// Number of pages to flush + /// Flush completion callback + /// Callback context + /// For the snapshot-replay flush, the snapshot object-log device whose object bytes (for records at/above + /// ) are copied into the main object-log during the flush. Null for non-object or hybrid-log-only flushes. + /// The former FlushedUntilAddress (hybrid-log/snapshot boundary); records at/above it have their objects copied. 
+ public void AsyncFlushPagesForRecovery(long scanFromAddress, long flushPageStart, int numPages, DeviceIOCompletionCallback callback, TContext context, + IDevice snapshotObjectLogDevice = null, long formerFlushedUntilAddress = long.MaxValue) { Debug.Assert(scanFromAddress < GetLogicalAddressOfStartOfPage(flushPageStart + 1), $"scanFromAddress ({scanFromAddress}) must be on flushPageStart ({flushPageStart})"); + + // When copying snapshot object bytes into the main object-log, we need write buffers on the main object-log device (as for a normal flush). + var copyObjects = snapshotObjectLogDevice is not null; for (var flushPage = flushPageStart; flushPage < (flushPageStart + numPages); flushPage++) { var asyncResult = new PageAsyncFlushResult() @@ -1896,11 +1929,14 @@ public void AsyncFlushPagesForRecovery(long scanFromAddress, long flus partial = false, fromAddress = Math.Max(scanFromAddress, GetLogicalAddressOfStartOfPage(flushPage)), untilAddress = GetLogicalAddressOfStartOfPage(flushPage + 1), - flushRequestState = FlushRequestState.Recovery + flushRequestState = FlushRequestState.Recovery, + recoverySnapshotObjectLogDevice = snapshotObjectLogDevice, + recoveryFormerFlushedUntilAddress = formerFlushedUntilAddress, + flushBuffers = copyObjects ? CreateCircularFlushBuffers(objectLogDevice: null, logger) : null }; - // For OA, we do not use FlushBuffers here; we set isForRecovery to reuse the stored lengths rather than re-serializing objects, - // using the lengths filled in during deserialization in RecoverHybridLog(Async), and when that is complete we fill in objectLogTail. + // For the snapshot region (records at/above formerFlushedUntilAddress) we copy object bytes from the snapshot object-log to the main + // object-log using flushBuffers; otherwise (hybrid-log region) we reuse the stored lengths/positions without writing object bytes. 
WriteAsync(flushPage, callback, asyncResult); } } diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs index 6c1fe626382..6673eddb72d 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs @@ -100,6 +100,9 @@ RecordSizeInfo GetDeleteRecordSize(TKey key) /// Return the for transient log records (e.g. iterator) ObjectIdMap TransientObjectIdMap { get; } + /// Return the for a specific page number (not index) + ObjectIdMap GetPageObjectIdMap(long pageNumber); + /// Dispose an in-memory log record void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason); @@ -116,6 +119,9 @@ RecordSizeInfo GetDeleteRecordSize(TKey key) /// Start logical address of the range. /// End logical address of the range (exclusive). /// Identifies whether this eviction is from the main log or the read cache. - void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source); + /// True when called during recovery, where a page's object load may have been deferred — such a page has an empty + /// ObjectIdMap and per-record object/overflow slots that still hold raw on-disk values (not valid ObjectIdMap ids) and is skipped. False for + /// normal eviction, where an empty map simply means an object-free page whose inline records must still be visited. 
+ void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery); } } \ No newline at end of file diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs index 5ad957a162c..58854f250a4 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs @@ -1549,6 +1549,32 @@ internal readonly void SetObjectLogRecordStartPositionAndLength(in ObjectLogFile SetDataHeader(dataHeader); } + /// + /// Repoints this record's object-log position word to without touching the R11-encoded + /// key/value lengths (in the RDH fields and the int* slots at keyAddress/valueAddress) or the . + /// + /// The new object-log position (e.g. the main object-log position a snapshot record's bytes were copied to). + /// + /// Used by the snapshot-recovery flush, which copies a record's object bytes from the snapshot object-log to the main object-log and must + /// repoint the disk-image record to the main position. The record's objects are NOT deserialized at this point (objectIdMap is empty and the + /// int* slots still hold the on-disk R11 length high-bits), so unlike and + /// this must not read the lengths from objectIdMap. The existing R11 length encoding + /// is preserved as-is, since the copied lengths are unchanged. + /// IMPORTANT: Like the other position setters, this is only safe to call on the disk-image copy of the record (srcBuffer). 
+ /// + internal readonly void RepointObjectLogPosition(in ObjectLogFilePositionInfo objectLogFilePosition) + { + if (DataHeader.RecordIsInline) + { + Debug.Fail("Cannot call RepointObjectLogPosition for an inline record"); + return; + } + + var (valueLength, valueAddress) = DataHeader.GetValueFieldInfo(physicalAddress); + var objectLogPositionPtr = (ulong*)GetObjectLogPositionAddress(valueAddress + valueLength); + *objectLogPositionPtr = objectLogFilePosition.word | ObjectLogFilePositionInfo.kReuseObjectIdForSizeMask; + } + /// /// Returns the object log position for the start of the key (if any) and value (if any), with the length encoded per R11: /// (low N bits from RDH KeyLength/ValueLength) + (next 32 bits from int* slot at keyAddress/valueAddress). diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs index 02fa68910e7..8604c1250b9 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs @@ -370,18 +370,6 @@ private unsafe void AsyncFlushCallback(uint errorCode, uint numBytes, object con #endregion #region Recover - /// - /// Recover - /// - /// - /// - /// - /// - public void Recover(IDevice device, ulong offset, int buckets, ulong numBytes) - { - BeginRecovery(device, offset, buckets, numBytes, out _); - } - /// /// Recover /// @@ -397,22 +385,6 @@ public async ValueTask RecoverAsync(IDevice device, ulong offset, int buc return numBytesRead; } - /// - /// Check if recovery complete - /// - /// - /// - public bool IsRecoveryCompleted(bool waitUntilComplete = false) - { - bool completed = recoveryCountdown.IsCompleted; - if (!completed && waitUntilComplete) - { - recoveryCountdown.Wait(); - return true; - } - return completed; - } - // Implementation of asynchronous recovery private CountdownWrapper recoveryCountdown; diff --git 
a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs index 2c8983cb54c..f913b275b10 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs @@ -129,6 +129,9 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key) /// public readonly ObjectIdMap TransientObjectIdMap => _this.transientObjectIdMap; + /// + public readonly ObjectIdMap GetPageObjectIdMap(long pageNumber) => _this.objectPages[_this.GetPageIndexForPage(pageNumber)].objectIdMap; + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public readonly void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason) => _this.OnDispose(ref logRecord, disposeReason); @@ -138,6 +141,6 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key) public readonly void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason disposeReason) => _this.OnDisposeDiskRecord(ref logRecord, disposeReason); /// - public readonly void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) => _this.EvictRecordsInRange(startAddress, endAddress, source); + public readonly void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) => _this.EvictRecordsInRange(startAddress, endAddress, source, isRecovery); } } \ No newline at end of file diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs index 4344f27eed7..2b25d7fd5d8 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs @@ -384,7 +384,7 @@ internal void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason dis /// ), so this routine walks records /// within that single page only. 
/// - internal void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) + internal void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) { var startPage = GetPage(startAddress); var firstValidAddress = GetFirstValidLogicalAddressOnPage(startPage); @@ -392,10 +392,17 @@ internal void EvictRecordsInRange(long startAddress, long endAddress, EvictionSo var pageEndAddress = GetLogicalAddressOfStartOfPage(startPage + 1); var stopAddress = endAddress < pageEndAddress ? endAddress : pageEndAddress; + // During recovery a page whose object load was deferred has an empty ObjectIdMap, and its per-record object/overflow slots still hold raw + // on-disk values, not valid ObjectIdMap ids; they must not be dereferenced and the records are not yet materialized, so skip the page. + // (In normal eviction an empty map merely means an object-free page, whose inline records must still be visited for OnEvict.) + var objectIdMap = objectPages[GetPageIndexForAddress(address)].objectIdMap; + if (isRecovery && objectIdMap.IsEmpty) + return; + while (address < stopAddress) { var physicalAddress = GetPhysicalAddress(address); - var logRecord = new LogRecord(physicalAddress, objectPages[GetPageIndexForAddress(address)].objectIdMap); + var logRecord = new LogRecord(physicalAddress, objectIdMap); var allocatedSize = logRecord.AllocatedSize; if (allocatedSize <= 0) @@ -765,6 +772,14 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA // Overflow Keys and Values are written to, and Object values are serialized to, this Stream, if we have flushBuffers. ObjectLogWriter logWriter = null; + // For a snapshot-region recovery flush, the reader over the snapshot object-log device from which each record's object bytes are + // copied into the main object-log (appended via logWriter). Null for non-recovery flushes and for the hybrid-log region (whose + // objects are already durable in the main object-log). 
+ var isSnapshotRecoveryCopy = asyncResult.recoverySnapshotObjectLogDevice is not null; + var formerFlushedUntilAddress = asyncResult.recoveryFormerFlushedUntilAddress; + CircularDiskReadBuffer snapshotObjectReadBuffers = null; + ObjectLogReader snapshotObjectReader = null; + // Do everything below here in the try{} to be sure the epoch is Resumed()d if we Suspended it. SectorAlignedMemory srcBuffer = default; try @@ -830,6 +845,7 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA var recoveryOngoingPageHeader = asyncResult.flushRequestState == FlushRequestState.Recovery ? pageHeader.GetLowestObjectLogPosition(objectLogTail.SegmentSizeBits) : default; var endLogicalAddress = logicalAddress + (endPhysicalAddress - physicalAddress); + while (physicalAddress < endPhysicalAddress) { // Increment for next iteration; use allocatedSize because that is what LogicalAddress is based on. @@ -908,10 +924,38 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA } else { - // In recovery we just need to update the disk-image LogRecord with the object lengths and file position, and then - // advance the recoveryOngoingPageHeader position. This advancement will also take care of segment breaks if needed. - var objectLengths = logRecord.SetRecoveredObjectLogRecordStartPosition(recoveryOngoingPageHeader); - recoveryOngoingPageHeader.Advance(objectLengths); + if (isSnapshotRecoveryCopy && logicalAddress >= formerFlushedUntilAddress) + { + // Snapshot-region recovery flush: the record's objects live only in the snapshot object-log. Copy their bytes + // into the main object-log (appended at the current objectLogTail via logWriter) so the page becomes durable and + // can be evicted, then repoint the disk-image record to that main object-log position. 
The objects are NOT + // deserialized at this point, so read the position/lengths from the R11 encoding (not from objectIdMap), and use + // RepointObjectLogPosition (which preserves the unchanged lengths) rather than SetRecoveredObjectLogRecordStartPosition. + var snapshotPositionWord = logRecord.GetObjectLogRecordStartPositionAndLengths(out var copyKeyLength, out var copyValueLength); + var copyObjectLength = (ulong)copyKeyLength + copyValueLength; + + // Demand-load the snapshot object reader on the first valid record with objects, so pages with few or no object + // records avoid an up-front full-page pre-pass. The read-ahead range is sized by scanning forward from here. + snapshotObjectReader ??= CreateSnapshotObjectReader(physicalAddress + logRecordSize, endPhysicalAddress, snapshotPositionWord, + copyKeyLength, copyValueLength, asyncResult.recoverySnapshotObjectLogDevice, out snapshotObjectReadBuffers); + + var mainRecordPosition = logWriter.GetNextRecordStartPosition(); + + // Position/await the snapshot read buffers at this record (skips sector padding and waits for the read-ahead IO), + // then stream the record's bytes verbatim into the main object-log. + if (!snapshotObjectReadBuffers.OnBeginRecord(new ObjectLogFilePositionInfo(snapshotPositionWord, objectLogTail.SegmentSizeBits))) + throw new TsavoriteException("No snapshot object-log data available while copying objects during recovery"); + logWriter.CopyRecoveredObjectBytes(snapshotObjectReader, copyObjectLength); + logRecord.RepointObjectLogPosition(mainRecordPosition); + recoveryOngoingPageHeader.Advance(copyObjectLength); + } + else + { + // In recovery we just need to update the disk-image LogRecord with the object lengths and file position, and then + // advance the recoveryOngoingPageHeader position. This advancement will also take care of segment breaks if needed. 
+ var objectLengths = logRecord.SetRecoveredObjectLogRecordStartPosition(recoveryOngoingPageHeader); + recoveryOngoingPageHeader.Advance(objectLengths); + } } // Do this for both cases so it's clear when debugging @@ -956,7 +1000,43 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA if (protectEpochWhenDone) epoch.Resume(); logWriter?.Dispose(); + snapshotObjectReader?.OnEndReadRecords(); + snapshotObjectReadBuffers?.Dispose(); + } + } + + /// + /// Demand-loads (creates and seeds) the reader over the snapshot object-log for a snapshot-region recovery flush, on the first valid record + /// with objects on the page. The read-ahead range is sized by scanning forward from to the last object + /// record on the page, so pages with few or no object records avoid an up-front full-page pre-pass. + /// + /// The (disk-image) address of the record just after the first object record. + /// The end of the page's records in the disk image. + /// The snapshot object-log position word of the first object record (the read-ahead start). + /// The first object record's key length. + /// The first object record's value length. + /// The snapshot object-log device to read from. + /// Outputs the created read buffers; the caller disposes them. 
+ private ObjectLogReader CreateSnapshotObjectReader(long nextRecordAddress, long endPhysicalAddress, ulong firstPositionWord, + int firstKeyLength, ulong firstValueLength, IDevice snapshotObjectLogDevice, out CircularDiskReadBuffer readBuffers) + { + var startPosition = new ObjectLogFilePositionInfo(firstPositionWord, objectLogTail.SegmentSizeBits); + var endPosition = startPosition; + var endKeyLength = firstKeyLength; + var endValueLength = firstValueLength; + for (var scanAddress = nextRecordAddress; scanAddress < endPhysicalAddress;) + { + var scanRecord = new LogRecord(scanAddress); + scanAddress += scanRecord.AllocatedSize; + if (scanRecord.Info.Valid && scanRecord.DataHeader.RecordHasObjects) + endPosition = new(scanRecord.GetObjectLogRecordStartPositionAndLengths(out endKeyLength, out endValueLength), objectLogTail.SegmentSizeBits); } + endPosition.Advance((ulong)endKeyLength + endValueLength); + + readBuffers = CreateCircularReadBuffers(snapshotObjectLogDevice, logger); + var reader = new ObjectLogReader(readBuffers, storeFunctions); + reader.OnBeginReadRecords(startPosition, endPosition - startPosition); + return reader; } /// @@ -1120,75 +1200,216 @@ private void AsyncReadPageWithObjectsCallback(uint errorCode, uint num return; } - var pageStartAddress = (long)result.destinationPtr; + // If this is Recovery Pass 1 we skip object deserialization (frame reads are in RecoveryPhase.None). + if (result.recoveryPhase != RecoveryPhase.Pass1) + { + var objectIdMapToUse = result.recoveryPhase != RecoveryPhase.None ? objectPages[result.page % BufferSize].objectIdMap : transientObjectIdMap; + DeserializeObjectsOnPage((long)result.destinationPtr, result.maxAddressOffsetOnPage, objectIdMapToUse, result.readBuffers); + } + + // Call the "real" page read callback + result.callback(errorCode, numBytes, context); + result.Free(); + } - // Iterate all records in range to determine how many bytes we need to read from objlog. 
+ /// + /// Deserialize objects on a page that has already been loaded into memory (physical addresses). + /// Scans records to determine object log ranges, reads from the object log via the provided readBuffers, and deserializes objects. + /// + /// Physical start address of the page in memory + /// Maximum offset on the page (PageSize or less for partial pages) + /// The ObjectIdMap to use for deserialized objects + /// The circular read buffers for object log reading + private void DeserializeObjectsOnPage(long pageStartPhysicalAddress, long maxAddressOffsetOnPage, ObjectIdMap objectIdMap, CircularDiskReadBuffer readBuffers) + { ObjectLogFilePositionInfo startPosition = new(), endPosition = new(); var endKeyLength = 0; ulong endValueLength = 0; - ulong totalBytesToRead = 0; - var recordAddress = pageStartAddress + PageHeader.Size; - var endAddress = pageStartAddress + result.maxAddressOffsetOnPage; + var recordAddress = pageStartPhysicalAddress + PageHeader.Size; + var endAddress = pageStartPhysicalAddress + maxAddressOffsetOnPage; + // First pass: determine the range of object log bytes to read while (recordAddress < endAddress) { - // Increment for next iteration; use allocatedSize because that is what LogicalAddress is based on. 
var logRecord = new LogRecord(recordAddress); recordAddress += logRecord.AllocatedSize; if (logRecord.DataHeader.RecordHasObjects && logRecord.Info.Valid) { - if (!startPosition.IsSet) - startPosition = new(logRecord.GetObjectLogRecordStartPositionAndLengths(out _, out _), objectLogTail.SegmentSizeBits); endPosition = new(logRecord.GetObjectLogRecordStartPositionAndLengths(out endKeyLength, out endValueLength), objectLogTail.SegmentSizeBits); + if (!startPosition.IsSet) + startPosition = endPosition; } } // The page may not have contained any records with objects - if (startPosition.IsSet) - { - endPosition.Advance((ulong)endKeyLength + endValueLength); - totalBytesToRead = endPosition - startPosition; + if (!startPosition.IsSet) + return; - // Iterate all records again to actually do the deserialization. - result.readBuffers.nextFileReadPosition = startPosition; - recordAddress = pageStartAddress + PageHeader.Size; - var logReader = new ObjectLogReader(result.readBuffers, storeFunctions); - logReader.OnBeginReadRecords(startPosition, totalBytesToRead); + endPosition.Advance((ulong)endKeyLength + endValueLength); + var totalBytesToRead = endPosition - startPosition; - var objectIdMapToUse = result.isForRecovery ? objectPages[result.page % BufferSize].objectIdMap : transientObjectIdMap; + // Second pass: deserialize objects + readBuffers.nextFileReadPosition = startPosition; + recordAddress = pageStartPhysicalAddress + PageHeader.Size; + var logReader = new ObjectLogReader(readBuffers, storeFunctions); + logReader.OnBeginReadRecords(startPosition, totalBytesToRead); + try + { while (recordAddress < endAddress) { - // Increment for next iteration; use allocatedSize because that is what LogicalAddress is based on. 
- var logRecord = new LogRecord(recordAddress, objectIdMapToUse); + var logRecord = new LogRecord(recordAddress, objectIdMap); recordAddress += logRecord.AllocatedSize; if (logRecord.DataHeader.RecordHasObjects && logRecord.Info.Valid) { _ = logReader.ReadRecordObjects(ref logRecord, default(EmptyKey), startPosition.SegmentSizeBits); - // CalculateHeapMemorySize returns 0 for tombstones, but eviction subtracts - // key overflow for tombstoned records. Add it here so the tracker stays balanced. - if (logRecord.Info.Tombstone) - { - if (logRecord.DataHeader.KeyIsOverflow) - logSizeTracker?.IncrementSize(logRecord.KeyOverflow.HeapMemorySize); - } - else - { - logSizeTracker?.UpdateSize(in logRecord, add: true); - } + TrackRecoveredObjectRecord(in logRecord); } } - - // Ensure we have finished all object reads + } + finally + { logReader.OnEndReadRecords(); } + } - // Call the "real" page read callback - result.callback(errorCode, numBytes, context); - result.Free(); - return; + private void TrackRecoveredObjectRecord(in LogRecord logRecord) + { + if (logSizeTracker is null) + return; + + // CalculateHeapMemorySize returns 0 for tombstones, but eviction subtracts + // key overflow for tombstoned records. Add it here so the tracker stays balanced. 
+ if (logRecord.Info.Tombstone) + { + if (logRecord.DataHeader.KeyIsOverflow) + logSizeTracker.IncrementSize(logRecord.KeyOverflow.HeapMemorySize); + } + else + logSizeTracker.UpdateSize(in logRecord, add: true); + } + + /// + internal override long CalculatePageObjectSizes(long page, long startAddress, long untilAddress) + { + var recordAddress = Math.Max(startAddress, GetFirstValidLogicalAddressOnPage(page)); + var endAddress = Math.Min(untilAddress, GetLogicalAddressOfStartOfPage(page + 1)); + long totalSize = 0; + + while (recordAddress < endAddress) + { + var logRecord = new LogRecord(GetPhysicalAddress(recordAddress)); + var allocatedSize = logRecord.AllocatedSize; + if (allocatedSize <= 0) + ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}"); + + recordAddress += allocatedSize; + if (recordAddress > endAddress) + ThrowTsavoriteException($"Unaligned end of page; record exceeded page by {recordAddress - endAddress} bytes"); + + if (logRecord.Info.Valid && logRecord.DataHeader.RecordHasObjects) + { + _ = logRecord.GetObjectLogRecordStartPositionAndLengths(out var keyLength, out var valueLength); + totalSize += keyLength + (long)valueLength; + } + } + + return totalSize; + } + + /// + /// Determine if this is the last valid record on the page. 
+ /// + /// Address of the current record + /// Address of the end of the page + /// True if this is the last valid record on the page, otherwise false + private bool IsLastRecordOnPage(long recordAddress, long endAddress, out long nextRecordAddress) + { + while (recordAddress < endAddress) + { + var logRecord = new LogRecord(GetPhysicalAddress(recordAddress)); + var allocatedSize = logRecord.AllocatedSize; + if (allocatedSize <= 0) + ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}"); + + recordAddress += allocatedSize; + if (recordAddress > endAddress) + ThrowTsavoriteException($"Unaligned end of page; record exceeded page by {recordAddress - endAddress} bytes"); + + if (logRecord.Info.Valid) + { + nextRecordAddress = recordAddress; + return false; + } + } + nextRecordAddress = -1L; + return true; + } + + /// + internal override void LoadObjectsForRecoveryPass2(long page, long fromAddress, long untilAddress, IDevice objectLogDevice) + { + var pageStartAddress = GetFirstValidLogicalAddressOnPage(page); + var address = Math.Max(fromAddress, pageStartAddress); + var endAddress = Math.Min(untilAddress, GetLogicalAddressOfStartOfPage(page + 1)); + if (address >= endAddress) + return; + + var pagePhysicalAddress = GetPhysicalAddress(GetLogicalAddressOfStartOfPage(page)); + var maxOffset = endAddress - GetLogicalAddressOfStartOfPage(page); + var objectIdMapToUse = objectPages[page % BufferSize].objectIdMap; + using var readBuffers = CreateCircularReadBuffers(objectLogDevice, logger); + DeserializeObjectsOnPage(pagePhysicalAddress, maxOffset, objectIdMapToUse, readBuffers); + } + + /// + internal override long FindHeadAddressCutoffOnPage(long page, long untilAddress, long totalPageObjectSize, int numPagesBelowCurrentPage, long remainingBudget, out int numPagesBelowToEvict) + { + var recordAddress = GetFirstValidLogicalAddressOnPage(page); + var stopAddress = Math.Min(untilAddress, GetLogicalAddressOfStartOfPage(page + 1)); + var 
overBudgetAmount = totalPageObjectSize - remainingBudget; + if (overBudgetAmount <= 0) + { + numPagesBelowToEvict = 0; + return recordAddress; + } + + // We are over budget. First see if we can evict enough pages to get below budget. + var pagesToEvictToGetUnderBudget = (int)((overBudgetAmount + PageSize - 1) / PageSize); + if (pagesToEvictToGetUnderBudget <= numPagesBelowCurrentPage) + { + // We can, and may even still have some pages left below us that can remain. + numPagesBelowToEvict = pagesToEvictToGetUnderBudget; + return recordAddress; + } + + // We cannot evict enough pages to get under budget. Evict all pages below, and then skip records on this page until we are under budget. + numPagesBelowToEvict = numPagesBelowCurrentPage; + overBudgetAmount -= (long)numPagesBelowToEvict * PageSize; + + while (recordAddress < stopAddress) + { + var logRecord = new LogRecord(GetPhysicalAddress(recordAddress)); + var allocatedSize = logRecord.AllocatedSize; + if (allocatedSize <= 0) + ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}"); + + recordAddress += allocatedSize; + if (recordAddress > stopAddress) + ThrowTsavoriteException($"Unaligned end of page; record exceeded page by {recordAddress - stopAddress} bytes"); + + if (logRecord.Info.Valid && logRecord.DataHeader.RecordHasObjects) + { + _ = logRecord.GetObjectLogRecordStartPositionAndLengths(out var keyLength, out var valueLength); + overBudgetAmount -= keyLength + (long)valueLength; + if (overBudgetAmount <= 0) + return recordAddress; + } + } + + return stopAddress; } /// diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs index 468ac39ddd7..e544dc5a0ad 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs @@ -35,6 +35,8 @@ internal ObjectIdMap() internal int Count => objectArray.Count; + internal bool IsEmpty => 
objectArray.Count == 0; + /// Reserve a slot and return its ID. [MethodImpl(MethodImplOptions.AggressiveInlining)] public int Allocate() diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs index 94820531b07..ce2bf23fef4 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs @@ -109,6 +109,43 @@ public ulong WriteRecordObjects(in OverflowByteArray keyOverflow, in OverflowByt return valueObjectBytesWritten; } + /// + /// Copies bytes of a record's serialized object data verbatim from the snapshot object-log (via + /// ) into this (main) object-log, then signals record completion. Used by the snapshot-region recovery + /// flush, which copies a record's object bytes without deserialize/reserialize. The must already be + /// positioned at the record (via ). + /// + /// The reader over the snapshot object-log, positioned at the record to copy. + /// The total number of object-log bytes for the record (key plus value). + public void CopyRecoveredObjectBytes(ObjectLogReader reader, ulong totalLength) + { + if (totalLength > 0) + { + var buffer = flushBuffers.bufferPool.Get(IStreamBuffer.BufferSize); + try + { + var chunkSpan = buffer.TotalValidSpan; + var remaining = totalLength; + while (remaining > 0) + { + var requestLength = (int)Math.Min(remaining, (ulong)chunkSpan.Length); + var bytesRead = reader.Read(chunkSpan.Slice(0, requestLength)); + if (bytesRead == 0) + throw new TsavoriteException("Unexpected end of snapshot object-log data while copying objects during recovery"); + Write(chunkSpan.Slice(0, bytesRead)); + remaining -= (ulong)bytesRead; + } + } + finally + { + flushBuffers.bufferPool.Return(buffer); + } + } + + // Signal completion, as WriteRecordObjects does. 
+ flushBuffers.OnRecordComplete(); + } + /// Start off the write using the full span of the . /// The to write. void WriteDirect(OverflowByteArray overflow) => WriteDirect(overflow, overflow.ReadOnlySpan, refCountedGCHandle: default); diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs index 893a1ef900d..41958e418e9 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs @@ -128,6 +128,9 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key) /// public readonly ObjectIdMap TransientObjectIdMap => default; + /// + public readonly ObjectIdMap GetPageObjectIdMap(long pageNumber) => default; + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason) => _this.OnDispose(ref logRecord, disposeReason); @@ -137,6 +140,6 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key) public void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason disposeReason) => _this.OnDisposeDiskRecord(ref logRecord, disposeReason); /// - public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) { } + public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) { } } } \ No newline at end of file diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs index e69b4ba783c..0fc40e4766f 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs @@ -127,6 +127,9 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key) /// public readonly ObjectIdMap TransientObjectIdMap => throw new NotImplementedException("Not implemented for 
TsavoriteLogAllocator"); + /// + public readonly ObjectIdMap GetPageObjectIdMap(long pageNumber) => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator"); + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason) => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator"); @@ -136,6 +139,6 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key) public void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason disposeReason) => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator"); /// - public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) { } + public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) { } } } \ No newline at end of file diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs index 8dc5c53897c..533156c8c67 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs @@ -25,6 +25,8 @@ internal class LogSettings /// for object serialization to the object log. 
public const int kMaxSegmentSizeBits = 62; + public const int kMinPageCount = 2; + /// Minimum number of bits for the size of the in-memory portion of the log public const int kMinMemorySizeBits = kMinPageSizeBits + 1; /// Maximum number of bits for the size of the in-memory portion of the log diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs index beaefe8cf1b..55ab03a5450 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs @@ -23,11 +23,14 @@ public class LogSizeTracker public static readonly int ResizeTaskDelaySeconds = 10; /// Target size must be at least this many pages; this gives us (at least a little) room for heap allocations in a minimum of - /// pages. - public const int MinTargetPageCount = MinResizeTargetPageCount * 2; + /// pages. + public const int MinTargetPageCount = LogSettings.kMinPageCount * 2; - /// When resizing we must preserve at least this many pages - public const int MinResizeTargetPageCount = 2; + /// + /// When evicting, do not allow HeadAddress to advance to within this many bytes of TailAddress. This usually allows more than one usable record in + /// the database. If there are records with objects in that range that exceed the memory budget, then the memory budget should be adjusted to allow for it. 
+ /// + public const int MinEvictionHeadAddressLag = 4096; } /// Tracks and controls size of log @@ -85,11 +88,15 @@ enum RunState : int { NotStarted, Running, StopRequested, Stopped }; public override string ToString() { return $"{runState}; TargetSize: [{TargetSize}, hi: {highTargetSize}, lo: {lowTargetSize}]; TotalSize: [{TotalSize}, Heap: {heapSize.Total}];" - + $" isOver: [{IsBeyondSizeLimit}, canEvict {IsBeyondSizeLimitAndCanEvict}]; AllocPgCt: {logAccessor.AllocatedPageCount}; PgSize {logAccessor.allocatorBase.PageSize}"; + + $" isOver: [{IsOverBudget}, canEvict {IsBeyondSizeLimitAndCanEvict}]; AllocPgCt: {logAccessor.AllocatedPageCount}; PgSize {logAccessor.allocatorBase.PageSize}"; } + /// Returns the memory budget we have remaining + /// May return a negative value if already over budget. + public long RemainingBudget => highTargetSize - TotalSize; + /// Return true if the total size is outside the target plus delta - public bool IsBeyondSizeLimit => TotalSize > highTargetSize; + public bool IsOverBudget => TotalSize > highTargetSize; /// Return true if the total size is outside the target plus delta *and* we have pages we can (partially or completely) evict /// If true, we are allocating a new page. Otherwise, we are called when adding or growing a new @@ -105,15 +112,12 @@ public bool IsBeyondSizeLimitAndCanEvict(bool addingPage = false) if (addingPage && numPages == logAccessor.allocatorBase.MaxAllocatedPageCount) return true; - // Otherwise, we need at least MinResizeTargetPageCount to be able to evict anything. - return (TotalSize > highTargetSize) && numPages > MinResizeTargetPageCount; + // Otherwise, we need at least MinEvictionHeadAddressLag to be able to evict anything. 
Use UnstableGetTailAddress (as above): this is + // reached from HandlePageOverflow on the thread that owns tail-address stabilization, and the stable GetTailAddress() would spin-wait + // forever for a TailPageOffset that only this same thread can reset (after NeedToWaitForClose returns). + return (TotalSize > highTargetSize) && logAccessor.allocatorBase.UnstableGetTailAddress(out _) - logAccessor.allocatorBase.HeadAddress >= MinEvictionHeadAddressLag; } - /// Return true if the total size plus the size needed for the requested number of pages to read is outside the target plus delta *and* - /// we have pages we can (partially or completely) evict - /// This is called by Recovery. - public bool IsBeyondSizeLimitToReadPages(int numPagesToRead) => TotalSize + (numPagesToRead * logAccessor.allocatorBase.PageSize) > highTargetSize; - /// Creates a new log size tracker /// Hybrid log accessor /// Target size for the hybrid log memory utilization @@ -267,77 +271,107 @@ private bool DetermineEvictionRange(long currentSize, CancellationToken cancella ref int allocatedPageCount, out long estimatedHeapTrimmedSize) { // We know we are oversize so we calculate how much we need to trim to get to lowTargetSize. - var overSize = currentSize - lowTargetSize; + var overBudgetAmount = currentSize - lowTargetSize; estimatedHeapTrimmedSize = 0L; var allocator = logAccessor.allocatorBase; headAddress = allocator.HeadAddress; - var headPage = allocator.GetPage(headAddress); - var untilAddress = allocator.UnstableGetTailAddress(out _); - var untilPage = allocator.GetPage(untilAddress); - - // The number of pages we have is untilPage - headPage + 1. 
- if (untilPage - headPage + 1 <= MinResizeTargetPageCount) - return false; - untilAddress = allocator.GetLogicalAddressOfStartOfPage(untilPage - MinResizeTargetPageCount + 1); + var startingHeadPage = allocator.GetPage(headAddress); + var maxEvictUntilAddress = allocator.UnstableGetTailAddress(out _) - MinEvictionHeadAddressLag; + var maxEvictUntilPage = allocator.GetPage(maxEvictUntilAddress); - // If there is nothing to trim from the heap, we can just do math to advance HA. + // If there is nothing to trim from the heap, we just do math to trim as many pages as we need to (up to the limit). if (heapSize.Total == 0) { - var evictableSize = untilAddress - headAddress; - var isComplete = overSize <= evictableSize; - if (!isComplete) - overSize = evictableSize; - headAddress = RoundUp(headAddress + overSize, Constants.kRecordAlignment); - - // Scan from head of page to snap headAddress to the next record boundary. - var pageIndex = allocator.GetPage(headAddress); - var pageStartAddress = allocator.GetLogicalAddressOfStartOfPage(pageIndex); - var offset = headAddress - pageStartAddress; - if (offset <= PageHeader.Size) - headAddress = pageStartAddress; - else + // We are evicting in units of pages, so we set this to the start of the maxEvictUntilPage. + maxEvictUntilAddress = allocator.GetLogicalAddressOfStartOfPage(maxEvictUntilPage); + var evictableSize = maxEvictUntilAddress - headAddress; + + // evictableSize is the resident span [headAddress, tail-aligned). When heapSize is 0, TotalSize == AllocatedPageCount * PageSize, so being + // over budget here means AllocatedPageCount * PageSize > budget; recovery keeps AllocatedPageCount within MaxAllocatedPageCount (the read + // batch is capped at the budget and a final trim evicts any object-free overage), so AllocatedPageCount ~= the resident page count and that + // resident span must itself exceed the budget => evictableSize > 0. 
A negative value would mean AllocatedPageCount exceeds the resident set + // (stale pages left allocated below headAddress), which we must not reach. + Debug.Assert(evictableSize >= 0, $"evictableSize ({evictableSize}) must be non-negative; AllocatedPageCount exceeds the resident set below headAddress."); + + var margin = evictableSize - overBudgetAmount; + var isComplete = margin > 0; + if (isComplete) { - var currentAddress = pageStartAddress + PageHeader.Size; - var physicalAddress = allocator.GetPhysicalAddress(currentAddress); - while (currentAddress < headAddress) - { - var allocatedSize = new LogRecord(physicalAddress).AllocatedSize; - currentAddress += allocatedSize; - physicalAddress += allocatedSize; - } + // We can completely satisfy the over-budget amount, so we can add some pages back to keep more below maxEvictUntilPage. + var additionalPagesToKeep = margin / allocator.PageSize; + maxEvictUntilPage -= additionalPagesToKeep; } - allocatedPageCount -= (int)(allocator.GetPage(headAddress) - headPage); + // Pages below maxEvictUntilPage are evicted, so the new headAddress is the first valid logical address on maxEvictUntilPage itself. + headAddress = allocator.GetFirstValidLogicalAddressOnPage(maxEvictUntilPage); + + allocatedPageCount -= (int)(maxEvictUntilPage - startingHeadPage); return isComplete; } - // This will iterate until iterator.CurrentAddress == untilAddress - using var iterator = logAccessor.Scan(headAddress, untilAddress); - allocatedPageCount = allocator.AllocatedPageCount; + // We have heap objects we can potentially evict. This walks pages from startingHeadPage up to the page containing maxEvictUntilAddress. + // To optimize performance, iterate pages and skip the whole page if its ObjectIdMap is null or empty, else enumerate records on the page. 
var pageTrimmedSize = 0L; - while (estimatedHeapTrimmedSize + pageTrimmedSize < overSize && iterator.GetNext() && !IsStopped) + var lastEvictPage = allocator.GetPage(maxEvictUntilAddress); + for (var currentPage = startingHeadPage; currentPage <= lastEvictPage && estimatedHeapTrimmedSize + pageTrimmedSize < overBudgetAmount && !IsStopped; currentPage++) { cancellationToken.ThrowIfCancellationRequested(); - estimatedHeapTrimmedSize += iterator.CalculateHeapMemorySize(); - // If we've crossed a page boundary, we can subtract the pagesize as well. - var currentPage = allocator.GetPage(iterator.CurrentAddress); - if (currentPage > headPage) + if (currentPage != startingHeadPage) + headAddress = allocator.GetFirstValidLogicalAddressOnPage(currentPage); + + // If there are no objects on this page and it's below maxEvictUntilPage (which may not be able to be evicted fully), + // we can skip the whole page and just subtract the pagesize from the amount we need to trim. + if (currentPage < maxEvictUntilPage) + { + var oidMap = allocator._wrapper.GetPageObjectIdMap(currentPage); + if (oidMap is null || oidMap.Count == 0) + { + pageTrimmedSize += allocator.PageSize; + if (estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount) + { + // Set headAddress to the start of the next page and we're done. + headAddress = allocator.GetFirstValidLogicalAddressOnPage(currentPage + 1); + break; + } + continue; + } + } + + // We have objects, so iterate records to see where the new headAddress must be. Don't go past maxEvictUntilAddress. 
+ var endAddress = allocator.GetLogicalAddressOfStartOfPage(currentPage + 1); + if (endAddress > maxEvictUntilAddress) + endAddress = maxEvictUntilAddress; + while (headAddress < endAddress) + { + var logRecord = allocator._wrapper.CreateLogRecord(headAddress); + var allocatedSize = logRecord.AllocatedSize; + if (allocatedSize <= 0) + ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}"); + + headAddress += allocatedSize; + if (!logRecord.Info.Valid) + continue; + + estimatedHeapTrimmedSize += logRecord.CalculateHeapMemorySize(); + if (estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount) + break; + } + + // If we have finished a page, add its size to our eviction total and set headAddress to the start of the next page. + if (headAddress >= endAddress) { - headPage = currentPage; - --allocatedPageCount; pageTrimmedSize += allocator.PageSize; + headAddress = allocator.GetFirstValidLogicalAddressOnPage(currentPage + 1); } - } - // iterator.NextAddress is the end of the last-processed record; if we did not advance far enough to clear all the oversize space - // it is the start of the next record we would have processed (and probably equal to untilAddress). In both cases it is how far we - // can evict to, and because it is the next address we've not yet evaluated whether it's crossed the page boundary; do that here. - headAddress = iterator.NextAddress; + if (estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount) + break; + } - // Return whether we could satisfy the resize request; for Recovery, we may need to wait on flush. - return estimatedHeapTrimmedSize + pageTrimmedSize >= overSize; + // headAddress is now properly set. Return whether we could satisfy the resize request; for Recovery, we may need to wait on flush. 
+ return estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount; } /// @@ -353,13 +387,16 @@ private void ResizeIfNeeded(CancellationToken cancellationToken) long headAddress, estimatedHeapTrimmedSize, readOnlyAddress; var isComplete = false; - var allocatedPageCount = logAccessor.AllocatedPageCount; - logger?.LogDebug("Heap size {totalLogSize} > target {highTargetSize}. Alloc: {AllocatedPageCount} BufferSize: {BufferSize}", heapSize.Total, highTargetSize, allocatedPageCount, logAccessor.BufferSize); + int allocatedPageCount; // Acquire the epoch long enough to calculate eviction ranges. logAccessor.allocatorBase.epoch.Resume(); try { + // AllocatedPageCount is set here, after we've resumed the epoch (which may have done eviction). + allocatedPageCount = logAccessor.AllocatedPageCount; + logger?.LogDebug("Heap size {totalLogSize} > target {highTargetSize}. Alloc: {AllocatedPageCount} BufferSize: {BufferSize}", heapSize.Total, highTargetSize, allocatedPageCount, logAccessor.BufferSize); + // See how much we can evict from HeadAddress onwards. Ignore the return value that indicates whether this is complete; // we calculate the new ROA up to MinTargetPageCount pages before TailAddress, and that's as far as we can go. 
isComplete = DetermineEvictionRange(currentSize, cancellationToken, out headAddress, ref allocatedPageCount, out estimatedHeapTrimmedSize); diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs index fed7c5ab07b..f75200046f2 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs @@ -21,16 +21,6 @@ public partial class TsavoriteBase public ICheckpointManager CheckpointManager => checkpointManager; // Derived class exposed API - internal void RecoverFuzzyIndex(IndexCheckpointInfo info) - { - ulong alignedIndexSize = InitializeMainIndexRecovery(ref info, isAsync: false); - overflowBucketsAllocator.Recover(info.main_ht_device, alignedIndexSize, info.info.num_buckets, info.info.num_ofb_bytes); - - // Wait until reading is complete - IsFuzzyIndexRecoveryComplete(true); - FinalizeMainIndexRecovery(info); - } - internal async ValueTask RecoverFuzzyIndexAsync(IndexCheckpointInfo info, CancellationToken cancellationToken) { ulong alignedIndexSize = InitializeMainIndexRecovery(ref info, isAsync: true); @@ -69,15 +59,6 @@ private void FinalizeMainIndexRecovery(IndexCheckpointInfo info) DeleteTentativeEntries(); } - // Test-only - internal void RecoverFuzzyIndex(int ht_version, IDevice device, ulong num_ht_bytes, IDevice ofbdevice, int num_buckets, ulong num_ofb_bytes) - { - BeginMainIndexRecovery(ht_version, device, num_ht_bytes); - var sectorSize = device.SectorSize; - var alignedIndexSize = (num_ht_bytes + (sectorSize - 1)) & ~((ulong)sectorSize - 1); - overflowBucketsAllocator.Recover(ofbdevice, alignedIndexSize, num_buckets, num_ofb_bytes); - } - // Test-only internal async ValueTask RecoverFuzzyIndexAsync(int ht_version, IDevice device, ulong num_ht_bytes, IDevice ofbdevice, int num_buckets, ulong num_ofb_bytes, CancellationToken cancellationToken) { @@ -88,13 +69,6 @@ internal 
async ValueTask RecoverFuzzyIndexAsync(int ht_version, IDevice device, await overflowBucketsAllocator.RecoverAsync(ofbdevice, alignedIndexSize, num_buckets, num_ofb_bytes, cancellationToken).ConfigureAwait(false); } - internal bool IsFuzzyIndexRecoveryComplete(bool waitUntilComplete = false) - { - bool completed1 = IsMainIndexRecoveryCompleted(waitUntilComplete); - bool completed2 = overflowBucketsAllocator.IsRecoveryCompleted(waitUntilComplete); - return completed1 && completed2; - } - /// /// Main Index Recovery Functions /// @@ -131,17 +105,6 @@ private unsafe void BeginMainIndexRecovery( Debug.Assert(numBytesRead == num_bytes); } - private bool IsMainIndexRecoveryCompleted(bool waitUntilComplete = false) - { - bool completed = recoveryCountdown.IsCompleted; - if (!completed && waitUntilComplete) - { - recoveryCountdown.Wait(); - return true; - } - return completed; - } - private unsafe void AsyncPageReadCallback(uint errorCode, uint numBytes, object overlap) { if (errorCode != 0) diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs index 2939f1b19cf..10e04450417 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs @@ -25,6 +25,9 @@ internal sealed class RecoveryStatus /// Object log recovery device, obtained from CheckpointManager. public IDevice objectLogRecoveryDevice; + /// The current head address; updated as pages are evicted during recovery. + public long headAddress; + /// Circular status buffer of 'capacity' size; the indexing wraps per hlog.GetPageIndexForPage(). public ReadStatus[] readStatus; /// Circular status buffer of 'capacity' size; the indexing wraps per hlog.GetPageIndexForPage(). 
@@ -65,7 +68,7 @@ internal void WaitRead(int pageIndex) while (readStatus[pageIndex] == ReadStatus.Pending) readSemaphore.Wait(); if (readStatus[pageIndex] == ReadStatus.Error) - throw new TsavoriteException($"Error reading page {pageIndex} from device"); + ThrowTsavoriteException($"Error reading page {pageIndex} from device"); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -74,7 +77,7 @@ internal async ValueTask WaitReadAsync(int pageIndex, CancellationToken cancella while (readStatus[pageIndex] == ReadStatus.Pending) await readSemaphore.WaitAsync(cancellationToken).ConfigureAwait(false); if (readStatus[pageIndex] == ReadStatus.Error) - throw new TsavoriteException($"Error reading page {pageIndex} from device"); + ThrowTsavoriteException($"Error reading page {pageIndex} from device"); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -117,15 +120,13 @@ internal void Dispose() } } - internal struct RecoveryOptions + internal readonly struct RecoveryOptions { - internal long headAddress; - internal long fuzzyRegionStartAddress; - internal bool undoNextVersion; + internal readonly long fuzzyRegionStartAddress; + internal readonly bool undoNextVersion; - internal RecoveryOptions(long headAddress, long fuzzyRegionStartAddress, bool undoNextVersion) + internal RecoveryOptions(long fuzzyRegionStartAddress, bool undoNextVersion) { - this.headAddress = headAddress; this.fuzzyRegionStartAddress = fuzzyRegionStartAddress; this.undoNextVersion = undoNextVersion; } @@ -162,8 +163,6 @@ public partial class TsavoriteKV : TsavoriteBase where TStoreFunctions : IStoreFunctions where TAllocator : IAllocator { - private const long NoPageFreed = -1; - /// /// GetLatestCheckpointTokens /// @@ -267,11 +266,7 @@ public long GetIndexFileSize(Guid token) return (long)(recoveredICInfo.info.num_ht_bytes + recoveredICInfo.info.num_ofb_bytes); } - private void GetClosestHybridLogCheckpointInfo( - long requestedVersion, - out Guid closestToken, - out HybridLogCheckpointInfo 
closest, - out byte[] cookie) + private void GetClosestHybridLogCheckpointInfo(long requestedVersion, out Guid closestToken, out HybridLogCheckpointInfo closest, out byte[] cookie) { HybridLogCheckpointInfo current; var closestVersion = long.MaxValue; @@ -289,11 +284,13 @@ private void GetClosestHybridLogCheckpointInfo( current = new HybridLogCheckpointInfo(); current.Recover(hybridLogToken, checkpointManager, out var currCookie); var distanceToTarget = (requestedVersion == -1 ? long.MaxValue : requestedVersion) - current.info.version; + // This is larger than intended version, cannot recover to this. - if (distanceToTarget < 0) continue; - // We have found the exact version to recover to --- the above conditional establishes that the - // checkpointed version is <= requested version, and if next version is larger than requestedVersion, - // there cannot be any closer version. + if (distanceToTarget < 0) + continue; + + // We have found the exact version to recover to: the above conditional establishes that the checkpointed version is <= requested version, + // and if nextVersion is larger than requestedVersion, there cannot be any closer version. 
if (current.info.nextVersion > requestedVersion) { closest = current; @@ -421,67 +418,6 @@ public void Reset() lastVersion = 0; } - /// Synchronous recovery driver - private long InternalRecover(Guid indexToken, Guid hybridLogToken, int numPagesToPreload, bool undoNextVersion) - { - GetRecoveryInfo(indexToken, hybridLogToken, out var recoveredHLCInfo, out var recoveredICInfo); - return InternalRecover(recoveredICInfo, recoveredHLCInfo, numPagesToPreload, undoNextVersion); - } - - /// Synchronous recovery driver - private long InternalRecover(IndexCheckpointInfo recoveredICInfo, HybridLogCheckpointInfo recoveredHLCInfo, int numPagesToPreload, bool undoNextVersion) - { - hlogBase.VerifyRecoveryInfo(recoveredHLCInfo, false); - - if (hlogBase.GetTailAddress() > hlogBase.GetFirstValidLogicalAddressOnPage(0)) - { - logger?.LogInformation("Recovery called on non-empty log - resetting to empty state first. Make sure store is quiesced before calling Recover on a running store."); - Reset(); - } - - if (!GetInitialRecoveryAddress(recoveredICInfo, recoveredHLCInfo, out long recoverFromAddress)) - RecoverFuzzyIndex(recoveredICInfo); - - if (!SetRecoveryPageRanges(recoveredHLCInfo, numPagesToPreload, recoverFromAddress, out long tailAddress, out long headAddress, out long scanFromAddress)) - return -1; - RecoveryOptions options = new(headAddress, fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion); - - // Make index consistent for version v - long readOnlyAddress, lastFreedPage; - if (recoveredHLCInfo.info.useSnapshotFile == 0) - { - lastFreedPage = RecoverHybridLog(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress, - recoveredHLCInfo.info.nextVersion, CheckpointType.FoldOver, options); - - readOnlyAddress = tailAddress; - } - else - { - if (recoveredHLCInfo.info.flushedLogicalAddress < headAddress) - headAddress = recoveredHLCInfo.info.flushedLogicalAddress; - - // First recover from index starting 
point (fromAddress) to snapshot starting point (flushedLogicalAddress taken at PERSISTENCE_CALLBACK, so it includes - // any flushes to the hybrid log files due to OnPagesMarkedReadOnly while we were flushing to the snapshot files). - lastFreedPage = RecoverHybridLog(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.flushedLogicalAddress, - recoveredHLCInfo.info.nextVersion, CheckpointType.Snapshot, options); - - // Then recover snapshot into mutable region. Note that the ObjectAllocator will not write object log records for the mutable region; - // that only happens during flushes due to OnPagesMarkedReadOnly. - var snapshotLastFreedPage = RecoverHybridLogFromSnapshotFile(scanFromAddress: recoveredHLCInfo.info.flushedLogicalAddress, - recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress, - snapshotStartAddress: recoveredHLCInfo.info.snapshotStartFlushedLogicalAddress, snapshotEndAddress: recoveredHLCInfo.info.snapshotFinalLogicalAddress, - recoveredHLCInfo.info.nextVersion, recoveredHLCInfo.info.guid, options); - - if (snapshotLastFreedPage != NoPageFreed) - lastFreedPage = snapshotLastFreedPage; - - readOnlyAddress = recoveredHLCInfo.info.flushedLogicalAddress; - } - - DoPostRecovery(recoveredICInfo, recoveredHLCInfo, tailAddress, ref headAddress, ref readOnlyAddress, lastFreedPage); - return recoveredHLCInfo.info.version; - } - /// Aynchronous recovery driver private ValueTask InternalRecoverAsync(Guid indexToken, Guid hybridLogToken, int numPagesToPreload, bool undoNextVersion, CancellationToken cancellationToken) { @@ -505,15 +441,20 @@ private async ValueTask InternalRecoverAsync(IndexCheckpointInfo recovered if (!SetRecoveryPageRanges(recoveredHLCInfo, numPagesToPreload, recoverFromAddress, out long tailAddress, out long headAddress, out long scanFromAddress)) return -1; - RecoveryOptions options = new(headAddress, fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion); + 
RecoveryOptions options = new(fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion); // Make index consistent for version v - long readOnlyAddress, lastFreedPage; + long readOnlyAddress; + long finalHeadAddress; + RecoveryStatus recoveryStatus; if (recoveredHLCInfo.info.useSnapshotFile == 0) { - lastFreedPage = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress, - recoveredHLCInfo.info.nextVersion, CheckpointType.FoldOver, options, cancellationToken).ConfigureAwait(false); + recoveryStatus = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress, + recoveredHLCInfo.info.nextVersion, CheckpointType.FoldOver, headAddress, options, cancellationToken).ConfigureAwait(false); + // FoldOver objects are already durable in the main object-log; set the tail to its end so subsequent writes append after it. + hlogBase.SetObjectLogTail(recoveredHLCInfo.info.hlogEndObjectLogTail); + finalHeadAddress = recoveryStatus.headAddress; readOnlyAddress = tailAddress; } else @@ -522,48 +463,40 @@ private async ValueTask InternalRecoverAsync(IndexCheckpointInfo recovered headAddress = recoveredHLCInfo.info.flushedLogicalAddress; // First recover from index starting point (fromAddress) to snapshot starting point (flushedLogicalAddress taken at PERSISTENCE_CALLBACK, so it includes - // any flushes to the hybrid log files due to OnPagesMarkedReadOnly while we were flushing to the snapshot files). - lastFreedPage = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.flushedLogicalAddress, - recoveredHLCInfo.info.nextVersion, CheckpointType.Snapshot, - new RecoveryOptions(headAddress, fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion), cancellationToken).ConfigureAwait(false); - - // Then recover snapshot into mutable region. 
Note that the ObjectAllocator will not write object log records for the mutable region; - // that only happens during flushes due to OnPagesMarkedReadOnly. - var snapshotLastFreedPage = await RecoverHybridLogFromSnapshotFileAsync(scanFromAddress: recoveredHLCInfo.info.flushedLogicalAddress, + // any flushes to the hybrid log files due to OnPagesMarkedReadOnly while we were flushing to the snapshot files). Object loading is deferred (see below). + recoveryStatus = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.flushedLogicalAddress, + recoveredHLCInfo.info.nextVersion, CheckpointType.Snapshot, headAddress, options, cancellationToken).ConfigureAwait(false); + + // Initialize the main object-log tail to the end of the hybrid-log objects BEFORE recovering the snapshot pages: the snapshot-region flushes copy + // each record's objects from the snapshot object-log into the main object-log starting here, advancing the tail (via OnPartialFlushComplete) as they go. + // This must happen after the hybrid-log phase (which runs with the tail unset, like before) and before the snapshot phase. + hlogBase.SetObjectLogTail(recoveredHLCInfo.info.hlogEndObjectLogTail); + + // Then recover snapshot into mutable region. The snapshot-region pages are read (without their objects), flushed to the main log with their objects + // copied into the main object-log (so they are durable and can be evicted into a smaller memory budget), and then objects are loaded once over the full + // recovered range (both the hybrid-log and snapshot regions), honoring the final headAddress. 
+ finalHeadAddress = await RecoverHybridLogFromSnapshotFileAsync(scanFromAddress: recoveredHLCInfo.info.flushedLogicalAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress, snapshotStartAddress: recoveredHLCInfo.info.snapshotStartFlushedLogicalAddress, snapshotEndAddress: recoveredHLCInfo.info.snapshotFinalLogicalAddress, - recoveredHLCInfo.info.nextVersion, recoveredHLCInfo.info.guid, options, cancellationToken).ConfigureAwait(false); - - if (snapshotLastFreedPage != NoPageFreed) - lastFreedPage = snapshotLastFreedPage; + recoveredHLCInfo.info.nextVersion, recoveredHLCInfo.info.guid, headAddress: recoveryStatus.headAddress, + options, cancellationToken).ConfigureAwait(false); readOnlyAddress = recoveredHLCInfo.info.flushedLogicalAddress; } - DoPostRecovery(recoveredICInfo, recoveredHLCInfo, tailAddress, ref headAddress, ref readOnlyAddress, lastFreedPage); + DoPostRecovery(recoveredICInfo, recoveredHLCInfo, tailAddress, finalHeadAddress, readOnlyAddress); return recoveredHLCInfo.info.version; } - private void DoPostRecovery(IndexCheckpointInfo recoveredICInfo, HybridLogCheckpointInfo recoveredHLCInfo, long tailAddress, ref long headAddress, ref long readOnlyAddress, long lastFreedPage) + private void DoPostRecovery(IndexCheckpointInfo recoveredICInfo, HybridLogCheckpointInfo recoveredHLCInfo, long tailAddress, long headAddress, long readOnlyAddress) { - // Adjust head and read-only address post-recovery - var _head = hlogBase.GetFirstValidLogicalAddressOnPage(1 + hlogBase.GetPage(tailAddress) - hlogBase.MaxAllocatedPageCount); - - // If additional pages have been freed to accommodate memory constraints, adjust head address accordingly - if (lastFreedPage != NoPageFreed) - { - var nextAddress = hlogBase.GetFirstValidLogicalAddressOnPage(lastFreedPage + 1); - if (_head < nextAddress) - _head = nextAddress; - } - - if (_head > headAddress) - headAddress = _head; + // HeadAddress has already been adjusted for any evictions but make sure 
we are not below any existing HeadAddress in the log. + if (headAddress < hlogBase.HeadAddress) + headAddress = hlogBase.HeadAddress; if (readOnlyAddress < headAddress) readOnlyAddress = headAddress; hlogBase.RecoveryReset(tailAddress, headAddress, recoveredHLCInfo.info.beginAddress, readOnlyAddress); - hlogBase.SetObjectLogTail(recoveredHLCInfo.info.hlogEndObjectLogTail); checkpointManager.OnRecovery(recoveredICInfo.info.token, recoveredHLCInfo.info.guid); recoveredHLCInfo.Dispose(); } @@ -573,10 +506,7 @@ private void DoPostRecovery(IndexCheckpointInfo recoveredICInfo, HybridLogCheckp /// Warning: use only when the system is not taking a checkpoint. /// /// Version to set the store to - public void SetVersion(long version) - { - stateMachineDriver.SetSystemState(SystemState.Make(Phase.REST, version)); - } + public void SetVersion(long version) => stateMachineDriver.SetSystemState(SystemState.Make(Phase.REST, version)); /// /// Compute recovery address and determine where to recover from @@ -657,10 +587,10 @@ private bool SetRecoveryPageRanges(HybridLogCheckpointInfo recoveredHLCInfo, int return true; } - private long ReadPagesWithMemoryConstraint(long endAddress, RecoveryStatus recoveryStatus, long page, long endPage, int numPagesToRead) + private void ReadPagesWithMemoryConstraint(long endAddress, RecoveryStatus recoveryStatus, long page, long endPage, int numPagesToRead) { - // Before reading in additional pages, trim memory if needed to make room for the inline space (we can't know the heap size yet) - var freedPage = TrimLogMemorySize(recoveryStatus, tailPage: page, numPagesToRead); + // Before reading in additional pages, trim memory if needed to make room for the inline page space. 
+ TrimLogPages(recoveryStatus, tailPage: page, numPagesToRead, untilAddress: endAddress); // Set all page read statuses to Pending for (var p = page; p < endPage; p++) @@ -668,130 +598,93 @@ private long ReadPagesWithMemoryConstraint(long endAddress, RecoveryStatus recov // Issue request to read pages as much as possible hlogBase.AsyncReadPagesForRecovery(page, numPagesToRead, endAddress, recoveryStatus, recoveryStatus.recoveryDevicePageOffset, - recoveryStatus.recoveryDevice, recoveryStatus.objectLogRecoveryDevice); - return freedPage; + recoveryStatus.recoveryDevice, recoveryStatus.objectLogRecoveryDevice, RecoveryPhase.Pass1); } - /// - /// Called before 'pagesToRead' number of pages are read into memory, this method determines how many previously allocated pages - /// must be (partially or completely) freed to avoid the total memory size to go beyond the specified maximum during recovery. - /// - /// True if is nonzero, else false - private bool GetEvictionPageRange(long tailPage, int numPagesToRead, CancellationToken cancellationToken, out long startPage, out int minEvictPageCount, out int maxEvictPageCount) + private void TrimLogPages(RecoveryStatus recoveryStatus, long tailPage, int numPagesToRead, long untilAddress) { - // The caller will iterate from startPage to endPage, so we use that as the basis for our eviction counts (which will start evicting at startPage). - // tailPage is the leading page index and start/endPage are the trailing page indexes: startPage is at the start of a full buffer of pages, - // and endPage is the start of the "usable" buffer capacity (the amount of pages we can actually use within the hlogBase.MaxAllocatedPageCount - // constraint) PLUS the number of pages to read. If hlogBase.MaxAllocatedPageCount is less than hlogBase.BufferSize, the the calling - // TrimLogMemorySize will probably be iterating over freed (non-allocated) pages from startPage to (endPage - numPagesToRead), and then - // will start actually evicting pages. 
NOTE: Currently numPagesToRead is always 1, but we may be able to optimize that in the future. - startPage = Math.Max(0, tailPage - hlogBase.BufferSize); - var endPage = Math.Max(0, tailPage - hlogBase.MaxAllocatedPageCount + numPagesToRead); - - // TODO: Currently Recovery is still page-level eviction only. hlogBase.HeadAddress etc. are not yet set so we will have to propagate - // the new headAddress back up the path we currently pass the lastFreedPage. - - // MinEvictPageCount is the number of pages we must clear so we can read numPagesToRead without violating the maximum page count constraint. - minEvictPageCount = Math.Max(0, (int)(endPage - startPage)); - maxEvictPageCount = minEvictPageCount; - if (endPage <= startPage) - return false; - - // If no log size tracker, just ensure MaxPageCount is not exceeded. if (hlogBase.logSizeTracker is null) - return minEvictPageCount > 0; + return; - // We have a log size tracker, so set minEvictPageCount to zero and maxEvictPageCount to the maximum number of pages we can evict; - // the caller will also test logSizeTracker.IsBeyondSizeLimitToReadPages during the eviction loop and jump out if it drops within budget. - maxEvictPageCount = Math.Max(minEvictPageCount, (int)(tailPage - startPage) - LogSizeTracker.MinResizeTargetPageCount); - return minEvictPageCount > 0 || hlogBase.logSizeTracker.IsBeyondSizeLimitToReadPages(numPagesToRead); - } + var headPage = hlogBase.GetPage(recoveryStatus.headAddress); + var loadedPages = tailPage - headPage + 1; + var totalPagesNeeded = loadedPages + numPagesToRead; - private long TrimLogMemorySize(RecoveryStatus recoveryStatus, long tailPage, int numPagesToRead) - { - var lastFreedPage = NoPageFreed; - if (GetEvictionPageRange(tailPage, numPagesToRead, cancellationToken: default, out long startPage, out int minEvictPageCount, out int maxEvictPageCount)) + // Respect the usual MinEvictionHeadAddressLag tail lag. 
Snapshot pages are made durable (objects copied to the main object-log) by + // RecoverSnapshotPages before they can be evicted here, so read-time eviction is free to evict any page to honor the memory budget. + var maxHeadAddress = untilAddress - LogSizeTracker.MinEvictionHeadAddressLag; + + // Evict pages from headAddress upward while over budget, respecting MinEvictionHeadAddressLag. This is during Pass1, + // so there are no objects to evict; we're evicting a full page each iteration. + while (totalPagesNeeded > 1 + && hlogBase.logSizeTracker.RemainingBudget < numPagesToRead * hlogBase.PageSize + && recoveryStatus.headAddress < maxHeadAddress) { - // Evict pages one at a time - for (var ii = 0; ii < maxEvictPageCount; ii++) + var pageIndex = hlogBase.GetPageIndexForPage(headPage); + if (hlogBase.IsAllocated(pageIndex)) { - if (hlogBase.logSizeTracker is not null && ii >= minEvictPageCount && !hlogBase.logSizeTracker.IsBeyondSizeLimitToReadPages(numPagesToRead)) - break; - var page = startPage + ii; - var pageIndex = hlogBase.GetPageIndexForPage(page); - if (hlogBase.IsAllocated(pageIndex)) - { - recoveryStatus.WaitFlush(pageIndex); - hlogBase.EvictPageForRecovery(page); - lastFreedPage = page; - } + recoveryStatus.WaitFlush(pageIndex); + hlogBase.EvictPageForRecovery(headPage); } + headPage++; + recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(headPage); + if (recoveryStatus.headAddress > maxHeadAddress) + { + recoveryStatus.headAddress = maxHeadAddress; + break; + } + totalPagesNeeded--; } - - return lastFreedPage; } - private async Task TrimLogMemorySizeAsync(RecoveryStatus recoveryStatus, long tailPage, int numPagesToRead, CancellationToken cancellationToken = default) + /// + /// After the recovery read loop and deferred object load, evict object-free resident pages from headAddress upward until AllocatedPageCount is within + /// , respecting . 
+ /// The per-batch reserves room for each upcoming read against the delta-padded highTargetSize budget and does not run after + /// the final batch, so an object-free (inline) store can settle one page above the hard MaxAllocatedPageCount cap. Object-free pages are durable on the + /// main log (re-read on demand); the walk stops at the first page with live objects, whose budget is governed by 's + /// heap-aware eviction. Dead pages below startPage (from store initialization) are freed up front in RecoverHybridLogAsync, so AllocatedPageCount here + /// reflects only resident data and this budget walk is accurate. + /// + private void TrimResidentPagesToBudget(RecoveryStatus recoveryStatus, long untilAddress) { - var lastFreedPage = NoPageFreed; - if (GetEvictionPageRange(tailPage, numPagesToRead, cancellationToken: default, out long startPage, out int minEvictPageCount, out int maxEvictPageCount)) + if (hlogBase.logSizeTracker is null) + return; + + var maxHeadAddress = untilAddress - LogSizeTracker.MinEvictionHeadAddressLag; + while (hlogBase.AllocatedPageCount > hlogBase.MaxAllocatedPageCount && recoveryStatus.headAddress < maxHeadAddress) { - // Evict pages one at a time - for (var ii = 0; ii < maxEvictPageCount; ii++) + var hp = hlogBase.GetPage(recoveryStatus.headAddress); + if (hlogBase.IsAllocated(hlogBase.GetPageIndexForPage(hp))) { - if (hlogBase.logSizeTracker is not null && ii >= minEvictPageCount && !hlogBase.logSizeTracker.IsBeyondSizeLimitToReadPages(numPagesToRead)) + var objectIdMap = hlogBase._wrapper.GetPageObjectIdMap(hp); + if (objectIdMap is not null && objectIdMap.Count > 0) break; - var page = startPage + ii; - var pageIndex = hlogBase.GetPageIndexForPage(page); - if (hlogBase.IsAllocated(pageIndex)) - { - await recoveryStatus.WaitFlushAsync(pageIndex, cancellationToken).ConfigureAwait(false); - hlogBase.EvictPageForRecovery(page); - lastFreedPage = page; - } + hlogBase.EvictPageForRecovery(hp); } + recoveryStatus.headAddress = 
hlogBase.GetFirstValidLogicalAddressOnPage(hp + 1); } - - return lastFreedPage; } - private (long end, long freedPage) ReadPagesForRecovery(long untilAddress, RecoveryStatus recoveryStatus, long endPage, int numPagesToReadPerIteration, long page) + private async ValueTask ReadPagesForRecoveryAsync(long untilAddress, RecoveryStatus recoveryStatus, long endPage, int numPagesToReadPerIteration, long page, CancellationToken cancellationToken) { var readEndPage = Math.Min(page + numPagesToReadPerIteration, endPage); if (page < readEndPage) { var numPagesToRead = (int)(readEndPage - page); - // Ensure that page slots that will be read into, have been flushed from previous reads. Due to the use of a single read semaphore, - // this must be done in batches of "all flushes' followed by "all reads" to ensure proper sequencing of reads when - // we are not using the full BufferSize (and thus the page-read index is not equal to the page-flush index). - WaitUntilAllPagesHaveBeenFlushed(page, readEndPage, recoveryStatus); - return (readEndPage, ReadPagesWithMemoryConstraint(untilAddress, recoveryStatus, page, readEndPage, numPagesToRead)); - } - - return (readEndPage, NoPageFreed); - } - - private async ValueTask<(long end, long freedPage)> ReadPagesForRecoveryAsync(long untilAddress, RecoveryStatus recoveryStatus, long endPage, int numPagesToReadPerIteration, long page, CancellationToken cancellationToken) - { - var readEndPage = Math.Min(page + numPagesToReadPerIteration, endPage); - if (page < readEndPage) - { - var numPagesToRead = (int)(readEndPage - page); - - // Ensure that page slots that will be read into, have been flushed from previous reads. Due to the use of a single read semaphore, - // this must be done in batches of "all flushes' followed by "all reads" to ensure proper sequencing of reads when + // Ensure that page slots that will be read into have been flushed from previous reads. 
Due to the use of a single read semaphore, + // this must be done in batches of all flushes followed by all reads to ensure proper sequencing of reads when // usableCapacity != capacity (and thus the page-read index is not equal to the page-flush index). await WaitUntilAllPagesHaveBeenFlushedAsync(page, readEndPage, recoveryStatus, cancellationToken).ConfigureAwait(false); - return (readEndPage, ReadPagesWithMemoryConstraint(untilAddress, recoveryStatus, page, readEndPage, numPagesToRead)); + ReadPagesWithMemoryConstraint(untilAddress, recoveryStatus, page, readEndPage, numPagesToRead); } - return (readEndPage, NoPageFreed); + return readEndPage; } /// - /// Synchronously recover the hybrid log from hybrid log files (not snapshot files). This also deserializes any objects or overflow and creates - /// entries for them in the . + /// Asynchronously recover the hybrid log from hybrid log files (not snapshot files). /// /// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page) /// The address from which to perform recovery (undo v+1 records and append to tag-chain tail) @@ -799,102 +692,52 @@ private async Task TrimLogMemorySizeAsync(RecoveryStatus recoveryStatus, l /// The next version of the database at the time of checkpoint flush /// The type of checkpoint /// The recovery options - /// The last freed page, if it was necessary to free any to limit heap memory - private long RecoverHybridLog(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, CheckpointType checkpointType, RecoveryOptions options) + /// The cancellation token + private async ValueTask RecoverHybridLogAsync(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, + CheckpointType checkpointType, long headAddress, RecoveryOptions options, CancellationToken cancellationToken) { - long lastFreedPage = NoPageFreed; - if (untilAddress <= scanFromAddress) - 
return lastFreedPage; - var recoveryStatus = GetPageRangesToRead(scanFromAddress, untilAddress, checkpointType, out long startPage, out long endPage, out int numPagesToReadPerIteration); + recoveryStatus.headAddress = headAddress; - Debug.Assert(hlogBase.logSizeTracker is null || numPagesToReadPerIteration == 1, "numPagesToReadPerIteration must be 1 when tracking sizes"); - for (var page = startPage; page < endPage; page += numPagesToReadPerIteration) + // Free any pages still allocated below startPage before reading. The store is freshly constructed for recovery with the allocator's minimum + // pages allocated at page 0 (Head=Begin=Tail=0); when the checkpoint's BeginAddress is above page 0 those low pages lie below the first page we + // read (startPage), and the upward-only read/eviction never reaches them. Freeing them up front keeps them out of AllocatedPageCount for the + // whole budget-checked recovery, instead of carrying the dead pages through every budget check and reclaiming them at the end. + for (var deadPage = 0L; deadPage < startPage && deadPage < hlogBase.BufferSize; deadPage++) { - var (end, freedPage) = ReadPagesForRecovery(untilAddress, recoveryStatus, endPage, numPagesToReadPerIteration, page); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - - var trimPageReadCount = numPagesToReadPerIteration; - for (var p = page; p < end; p++) - { - // Ensure page has been read into memory - int pageIndex = hlogBase.GetPageIndexForPage(p); - recoveryStatus.WaitRead(pageIndex); - - if (hlogBase.logSizeTracker is not null) - { - // Trim the log memory again in case we read large objects on the current page. Add 1 to tailPage so that - // when the BufferSize subtraction wraps around the buffer it won't try to evict the page we just added. - // Decrease trimPageReadCount as we process each page so we don't over-prune. 
- freedPage = TrimLogMemorySize(recoveryStatus, tailPage: p + 1, trimPageReadCount--); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - } - - // We make an extra pass to clear locks when reading every page back into memory - ClearBitsOnPage(p, untilAddress, options); - ProcessReadPageAndFlush(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, p, pageIndex); - } + if (hlogBase.IsAllocated(hlogBase.GetPageIndexForPage(deadPage))) + hlogBase.EvictPageForRecovery(deadPage); } - WaitUntilAllPagesHaveBeenFlushed(startPage, endPage, recoveryStatus); - return lastFreedPage; - } - - /// - /// Synchronously recover the hybrid log from hybrid log files (not snapshot files). This also deserializes any objects or overflow and creates - /// entries for them in the . - /// - /// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page) - /// The address from which to perform recovery (undo v+1 records and append to tag-chain tail) - /// The last address to scan; this is initially the tailAddress at the time of checkpoint flush, - /// The next version of the database at the time of checkpoint flush - /// The type of checkpoint - /// The recovery options - /// The cancellation token - /// The last freed page, if it was necessary to free any to limit heap memory - private async ValueTask RecoverHybridLogAsync(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, - CheckpointType checkpointType, RecoveryOptions options, CancellationToken cancellationToken) - { - long lastFreedPage = NoPageFreed; if (untilAddress <= scanFromAddress) - return lastFreedPage; + return recoveryStatus; - var recoveryStatus = GetPageRangesToRead(scanFromAddress, untilAddress, checkpointType, out long startPage, out long endPage, out int numPagesToReadPerIteration); - - Debug.Assert(hlogBase.logSizeTracker is null || numPagesToReadPerIteration 
== 1, "numPagesToReadPerIteration must be 1 when tracking sizes"); for (long page = startPage; page < endPage; page += numPagesToReadPerIteration) { - var (end, freedPage) = await ReadPagesForRecoveryAsync(untilAddress, recoveryStatus, endPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - - var trimPageReadCount = numPagesToReadPerIteration; + var end = await ReadPagesForRecoveryAsync(untilAddress, recoveryStatus, endPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false); for (var p = page; p < end; p++) { // Ensure page has been read into memory var pageIndex = hlogBase.GetPageIndexForPage(p); await recoveryStatus.WaitReadAsync(pageIndex, cancellationToken).ConfigureAwait(false); - if (hlogBase.logSizeTracker is not null) - { - // Trim the log memory again in case we read large objects on the current page. Add 1 to tailPage so that - // when the BufferSize subtraction wraps around the buffer it won't try to evict the page we just added. - // Decrease trimPageReadCount as we process each page so we don't over-prune. 
- freedPage = await TrimLogMemorySizeAsync(recoveryStatus, tailPage: p + 1, trimPageReadCount--, cancellationToken).ConfigureAwait(false); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - } - // We make an extra pass to clear locks when reading every page back into memory - ClearBitsOnPage(p, untilAddress, options); - ProcessReadPageAndFlush(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, p, pageIndex); + ClearBitsOnPage(p, untilAddress, in options, recoveryStatus.headAddress); + ProcessReadPageAndFlush(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, in options, recoveryStatus, p, pageIndex); } } await WaitUntilAllPagesHaveBeenFlushedAsync(startPage, endPage, recoveryStatus, cancellationToken).ConfigureAwait(false); - return lastFreedPage; + + // Defer object loading when this is the hybrid-log phase of a snapshot recovery; RecoverHybridLogFromSnapshotFileAsync + // loads the objects once after the snapshot pages have also been read (without their objects), so the final headAddress + // (after eviction over the full recovered range) is honored. For FoldOver there is no following snapshot phase. + if (checkpointType != CheckpointType.Snapshot) + { + RecoveryLoadObjectsPass2(recoveryStatus, recoveryStatus.headAddress, untilAddress, objectLogDevice: null); + TrimResidentPagesToBudget(recoveryStatus, untilAddress); + } + return recoveryStatus; } /// @@ -915,9 +758,16 @@ private RecoveryStatus GetPageRangesToRead(long scanFromAddress, long untilAddre if (untilAddress > hlogBase.GetFirstValidLogicalAddressOnPage(endPage) && untilAddress > scanFromAddress) endPage++; - // If heap memory is to be tracked, then read one page at a time to control memory usage - var totalPagesToRead = (int)(endPage - startPage); - numPagesToReadPerIteration = hlogBase.logSizeTracker is null ? 
Math.Min(hlogBase.BufferSize, totalPagesToRead) : 1; + // Read as many pages as buffer allows, leaving room for at least 1 page for eviction. + numPagesToReadPerIteration = Math.Min(hlogBase.BufferSize - 1, (int)(endPage - startPage)); + + // Never read more pages per batch than the memory budget allows. BufferSize can exceed MaxAllocatedPageCount when the budget is not a + // power-of-two page count (e.g. a 23k budget => MaxAllocatedPageCount 5, BufferSize 8); reading a full BufferSize-1 batch would fill the + // circular buffer above MaxAllocatedPageCount, leaving over-budget pages resident that read-time eviction (TrimLogPages) cannot reclaim + // because they were read below the eviction floor (untilAddress - MinEvictionHeadAddressLag). MaxAllocatedPageCount is the allocator's hard + // cap on AllocatedPageCount, so honoring it here keeps recovery within budget at every step (modulo the MinEvictionHeadAddressLag tail). + if (hlogBase.logSizeTracker is not null && hlogBase.MaxAllocatedPageCount < numPagesToReadPerIteration) + numPagesToReadPerIteration = hlogBase.MaxAllocatedPageCount; return new RecoveryStatus(hlogBase.BufferSize); } @@ -932,7 +782,7 @@ private RecoveryStatus GetPageRangesToRead(long scanFromAddress, long untilAddre /// The instance /// The page number to process /// The index of in the allocator's circular page buffer - private void ProcessReadPageAndFlush(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options, + private void ProcessReadPageAndFlush(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options, RecoveryStatus recoveryStatus, long page, int pageIndex) { if (ProcessReadPage(recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, page, pageIndex)) @@ -957,7 +807,7 @@ private void ProcessReadPageAndFlush(long scanFromAddress, long recoverFromAddre /// The page number to process /// The index of in the allocator's 
circular page buffer /// - private bool ProcessReadPage(long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options, RecoveryStatus recoveryStatus, + private bool ProcessReadPage(long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options, RecoveryStatus recoveryStatus, long page, int pageIndex) { var startLogicalAddressOfPage = hlogBase.GetLogicalAddressOfStartOfPage(page); // Do not offset for page header; that's done below and in RecoverFromPage @@ -989,12 +839,6 @@ private bool ProcessReadPage(long recoverFromAddress, long untilAddress, long ne return false; } - private void WaitUntilAllPagesHaveBeenFlushed(long startPage, long endPage, RecoveryStatus recoveryStatus) - { - for (long page = startPage; page < endPage; page++) - recoveryStatus.WaitFlush(hlogBase.GetPageIndexForPage(page)); - } - private async ValueTask WaitUntilAllPagesHaveBeenFlushedAsync(long startPage, long endPage, RecoveryStatus recoveryStatus, CancellationToken cancellationToken) { for (long page = startPage; page < endPage; page++) @@ -1002,7 +846,7 @@ private async ValueTask WaitUntilAllPagesHaveBeenFlushedAsync(long startPage, lo } /// - /// Synchronously recover the hybrid log from snapshot files + /// Asynchronously recover the hybrid log from snapshot files. 
/// /// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page) /// The address from which to perform recovery (undo v+1 records and append to tag-chain tail) @@ -1011,24 +855,33 @@ private async ValueTask WaitUntilAllPagesHaveBeenFlushedAsync(long startPage, lo /// The end of the snapshot; the tailAddress at the start of the WAIT_FLUSH phase /// The next version of the database at the time of checkpoint flush /// The checkpoint token guid + /// The headAddress resulting from the preceding hybrid-log recovery phase (the lowest resident address); seeds eviction tracking here /// The recovery options - /// The last freed page, if it was necessary to free any to limit heap memory - private long RecoverHybridLogFromSnapshotFile(long scanFromAddress, long recoverFromAddress, long untilAddress, - long snapshotStartAddress, long snapshotEndAddress, long nextVersion, Guid guid, RecoveryOptions options) + /// The final headAddress (lowest resident address) after reading the snapshot pages and loading objects + private async ValueTask RecoverHybridLogFromSnapshotFileAsync(long scanFromAddress, long recoverFromAddress, long untilAddress, + long snapshotStartAddress, long snapshotEndAddress, long nextVersion, Guid guid, long headAddress, RecoveryOptions options, + CancellationToken cancellationToken) { - long lastFreedPage = NoPageFreed; GetSnapshotPageRangesToRead(scanFromAddress, untilAddress, snapshotStartAddress, snapshotEndAddress, guid, out long startPage, out long endPage, out long snapshotEndPage, out var recoveryStatus, out int numPagesToReadPerIteration); + // Seed the head from the preceding hybrid-log phase so the snapshot-read loop (TrimLogPages) and the deferred object load + // evict from, and track, the correct lowest-resident address across the full recovered range. 
+ recoveryStatus.headAddress = headAddress; + + // The snapshot region is the boundary page (the page containing scanFromAddress) and every page above it; pages strictly below it are the + // hybrid-log region. RecoverSnapshotPages flushes every snapshot page to the main log AND copies its objects into the main object-log, so + // snapshot pages are fully durable and may be evicted during recovery (read-time via TrimLogPages or load-time below) — required to recover + // into a smaller memory budget than was checkpointed. The boundary is used below to choose the object-log device for deferred deserialization. + var boundaryPageStart = hlogBase.GetLogicalAddressOfStartOfPage(hlogBase.GetPage(scanFromAddress)); + // Notify application of checkpoint token before processing snapshot records if (storeFunctions.CallOnDiskRead) storeFunctions.OnRecovery(guid); for (long page = startPage; page < endPage; page += numPagesToReadPerIteration) { - var (_, freedPage) = ReadPagesForRecovery(snapshotEndAddress, recoveryStatus, snapshotEndPage, numPagesToReadPerIteration, page); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; + _ = await ReadPagesForRecoveryAsync(snapshotEndAddress, recoveryStatus, snapshotEndPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false); var end = Math.Min(page + numPagesToReadPerIteration, endPage); for (long p = page; p < end; p++) @@ -1037,22 +890,14 @@ private long RecoverHybridLogFromSnapshotFile(long scanFromAddress, long recover if (p < snapshotEndPage) { // Ensure the page is read from file - recoveryStatus.WaitRead(pageIndex); - - if (hlogBase.logSizeTracker is not null) - { - // Trim the log memory again in case we read large objects on the current page. Use 0 for numPagesToRead so we don't over-prune. 
- freedPage = TrimLogMemorySize(recoveryStatus, tailPage: p + 1, 0); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - } + await recoveryStatus.WaitReadAsync(pageIndex, cancellationToken).ConfigureAwait(false); // We make an extra pass to clear locks when reading pages back into memory - ClearBitsOnPage(p, untilAddress, options, snapshotFromAddress: scanFromAddress); + ClearBitsOnPage(p, untilAddress, in options, recoveryStatus.headAddress, snapshotFromAddress: scanFromAddress); } else { - recoveryStatus.WaitFlush(pageIndex); + await recoveryStatus.WaitFlushAsync(pageIndex, cancellationToken).ConfigureAwait(false); if (!hlogBase.IsAllocated(pageIndex)) hlog.AllocatePage(pageIndex); else @@ -1060,88 +905,148 @@ private long RecoverHybridLogFromSnapshotFile(long scanFromAddress, long recover } } - RecoverSnapshotPages(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options, + RecoverSnapshotPages(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, in options, endPage, snapshotEndPage, numPagesToReadPerIteration, recoveryStatus, page, end); } - WaitUntilAllPagesHaveBeenFlushed(startPage, endPage, recoveryStatus); + await WaitUntilAllPagesHaveBeenFlushedAsync(startPage, endPage, recoveryStatus, cancellationToken).ConfigureAwait(false); + + // Deferred object load over the full recovered range, honoring the final headAddress. Phase 2 read the snapshot pages as full + // pages, so the page containing scanFromAddress (the boundary page, boundaryPageStart) and every page above it were read from the + // snapshot device and their live records reference the snapshot object-log; pages strictly below the boundary page were read by the + // hybrid-log phase from the main object-log. The device boundary is therefore page-aligned at boundaryPageStart (computed above). 
+ + // Snapshot region (boundary page and above): deserialize resident pages' objects from the snapshot object-log device (the live records + // still carry their snapshot positions). These pages are now durable on the main log/object-log (RecoverSnapshotPages copied their objects), + // so evict pages as needed to honor the memory budget; an evicted record is simply read back from the main log/object-log on demand. + RecoveryLoadObjectsPass2(recoveryStatus, Math.Max(recoveryStatus.headAddress, boundaryPageStart), untilAddress, recoveryStatus.objectLogRecoveryDevice); + + // Hybrid-log region (below the boundary page): read objects from the main object-log device, evicting pages as needed to honor the + // memory budget. These pages are durable on the main log/object-log, so an evicted record is simply read back from disk on demand. + if (recoveryStatus.headAddress < boundaryPageStart) + RecoveryLoadObjectsPass2(recoveryStatus, recoveryStatus.headAddress, boundaryPageStart, objectLogDevice: null); + + // Bring AllocatedPageCount within the hard MaxAllocatedPageCount cap for object-free pages (see TrimResidentPagesToBudget): the per-batch + // read-time trim targets the delta-padded highTargetSize and does not run after the final batch, so an inline store can settle one page over. + TrimResidentPagesToBudget(recoveryStatus, untilAddress); + + var finalHeadAddress = recoveryStatus.headAddress; recoveryStatus.Dispose(); - return lastFreedPage; + return finalHeadAddress; } /// - /// Asynchronously recover the hybrid log from snapshot files + /// Load (deserialize) objects for the recovered pages in the address range [, ), + /// reading the object log from (null = the main object-log device). The page range is derived from the + /// addresses. 
/// - /// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page) - /// The address from which to perform recovery (undo v+1 records and append to tag-chain tail) - /// The last address to scan; this is initially the tailAddress at the time of checkpoint flush, - /// The start of the mutable region; the FlushedUntilAddress at the start of the WAIT_FLUSH phase - /// The end of the snapshot; the tailAddress at the start of the WAIT_FLUSH phase - /// The next version of the database at the time of checkpoint flush - /// The checkpoint token guid - /// The recovery options - /// The last freed page, if it was necessary to free any to limit heap memory - private async ValueTask RecoverHybridLogFromSnapshotFileAsync(long scanFromAddress, long recoverFromAddress, long untilAddress, - long snapshotStartAddress, long snapshotEndAddress, long nextVersion, Guid guid, RecoveryOptions options, - CancellationToken cancellationToken) + /// The instance; its headAddress is the eviction floor and is advanced as pages are evicted + /// The lowest address whose objects are to be loaded (the load floor; pages below it are not loaded by this call) + /// The end of the range whose objects are to be loaded + /// The object-log device to read from; null means the main object-log device + private void RecoveryLoadObjectsPass2(RecoveryStatus recoveryStatus, long fromAddress, long untilAddress, IDevice objectLogDevice) { - long lastFreedPage = NoPageFreed; - GetSnapshotPageRangesToRead(scanFromAddress, untilAddress, snapshotStartAddress, snapshotEndAddress, guid, out long startPage, - out long endPage, out long snapshotEndPage, out var recoveryStatus, out int numPagesToReadPerIteration); + if (fromAddress >= untilAddress) + return; - // Notify application of checkpoint token before processing snapshot records - if (storeFunctions.CallOnDiskRead) - storeFunctions.OnRecovery(guid); + var startPage = 
hlogBase.GetPage(fromAddress); + var endPage = hlogBase.GetPage(untilAddress); + if (untilAddress > hlogBase.GetFirstValidLogicalAddressOnPage(endPage)) + endPage++; - for (long page = startPage; page < endPage; page += numPagesToReadPerIteration) + // Load all objects from fromAddress to untilAddress with no eviction when there is no size tracker. + if (hlogBase.logSizeTracker is null) { - var (_, freedPage) = await ReadPagesForRecoveryAsync(snapshotEndAddress, recoveryStatus, snapshotEndPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - var end = Math.Min(page + numPagesToReadPerIteration, endPage); + for (var page = startPage; page < endPage; page++) + { + var pageIndex = hlogBase.GetPageIndexForPage(page); + if (!hlogBase.IsAllocated(pageIndex)) + continue; - for (long p = page; p < end; p++) + var pageFromAddress = page == startPage ? fromAddress : hlogBase.GetFirstValidLogicalAddressOnPage(page); + var pageUntilAddress = page == endPage - 1 ? untilAddress : hlogBase.GetLogicalAddressOfStartOfPage(page + 1); + hlogBase.LoadObjectsForRecoveryPass2(page, pageFromAddress, pageUntilAddress, objectLogDevice); + } + return; + } + + // With a size tracker, iterate pages from highest (untilAddress) to lowest (fromAddress) with budget control, evicting pages (and moving headAddress up) as needed. + var maxHeadAddress = untilAddress - LogSizeTracker.MinEvictionHeadAddressLag; + + for (var page = endPage - 1; page >= startPage; page--) + { + var pageIndex = hlogBase.GetPageIndexForPage(page); + if (!hlogBase.IsAllocated(pageIndex)) + continue; + + var pageFromAddress = Math.Max(fromAddress, hlogBase.GetFirstValidLogicalAddressOnPage(page)); + var pageUntilAddress = page == endPage - 1 ? 
untilAddress : hlogBase.GetLogicalAddressOfStartOfPage(page + 1); + if (pageFromAddress >= pageUntilAddress) + continue; + + // Enforce MinEvictionHeadAddressLag: clamp pageFromAddress + if (pageFromAddress > maxHeadAddress) + pageFromAddress = maxHeadAddress; + + var totalPageObjectSize = hlogBase.CalculatePageObjectSizes(page, pageFromAddress, pageUntilAddress); + if (totalPageObjectSize == 0) { - int pageIndex = hlogBase.GetPageIndexForPage(p); - if (p < snapshotEndPage) - { - // Ensure the page is read from file - await recoveryStatus.WaitReadAsync(pageIndex, cancellationToken).ConfigureAwait(false); + hlogBase.LoadObjectsForRecoveryPass2(page, pageFromAddress, pageUntilAddress, objectLogDevice); + continue; + } - if (hlogBase.logSizeTracker is not null) - { - // Trim the log memory again in case we read large objects on the current page. Use 0 for numPagesToRead so we don't over-prune. - freedPage = await TrimLogMemorySizeAsync(recoveryStatus, tailPage: p + 1, numPagesToRead: 0, cancellationToken).ConfigureAwait(false); - if (freedPage != NoPageFreed) - lastFreedPage = freedPage; - } + var remainingBudget = hlogBase.logSizeTracker.RemainingBudget; + var pageCutoff = hlogBase.FindHeadAddressCutoffOnPage(page, pageUntilAddress, totalPageObjectSize, (int)(page - hlogBase.GetPage(recoveryStatus.headAddress)), remainingBudget, out var numPagesBelowToEvict); - // We make an extra pass to clear locks when reading pages back into memory - ClearBitsOnPage(p, untilAddress, options, snapshotFromAddress: scanFromAddress); - } - else + // Evict pages below if needed + var currentHeadPage = hlogBase.GetPage(recoveryStatus.headAddress); + while (numPagesBelowToEvict > 0 && currentHeadPage < page) + { + var headPageIndex = hlogBase.GetPageIndexForPage(currentHeadPage); + if (hlogBase.IsAllocated(headPageIndex)) + hlogBase.EvictPageForRecovery(currentHeadPage); + + currentHeadPage++; + recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(currentHeadPage); + 
numPagesBelowToEvict--; + } + + // Load objects, using per-record budget checking via DeserializeObjectsOnPage. + // The method handles all records from pageCutoff to pageUntilAddress. + hlogBase.LoadObjectsForRecoveryPass2(page, pageCutoff, pageUntilAddress, objectLogDevice); + + // After loading, recheck budget. If over budget, evict from headAddress up to and including loaded records. + if (hlogBase.logSizeTracker.IsOverBudget && recoveryStatus.headAddress < maxHeadAddress) + { + // Evict from headAddress upward until under budget or at the lag limit + while (hlogBase.logSizeTracker.IsOverBudget && recoveryStatus.headAddress < maxHeadAddress) { - await recoveryStatus.WaitFlushAsync(pageIndex, cancellationToken).ConfigureAwait(false); - if (!hlogBase.IsAllocated(pageIndex)) - hlog.AllocatePage(pageIndex); - else - hlogBase.ClearPage(pageIndex); + currentHeadPage = hlogBase.GetPage(recoveryStatus.headAddress); + if (currentHeadPage >= page) + break; + + var headPageIndex = hlogBase.GetPageIndexForPage(currentHeadPage); + if (hlogBase.IsAllocated(headPageIndex)) + hlogBase.EvictPageForRecovery(currentHeadPage); + + recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(currentHeadPage + 1); } } - RecoverSnapshotPages(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options, - endPage, snapshotEndPage, numPagesToReadPerIteration, recoveryStatus, page, end); - } + // Update headAddress from cutoff if it was raised + if (pageCutoff > recoveryStatus.headAddress) + recoveryStatus.headAddress = pageCutoff; - await WaitUntilAllPagesHaveBeenFlushedAsync(startPage, endPage, recoveryStatus, cancellationToken).ConfigureAwait(false); - recoveryStatus.Dispose(); - return lastFreedPage; + // If headAddress is on or above the current page, we're done + if (recoveryStatus.headAddress >= hlogBase.GetFirstValidLogicalAddressOnPage(page)) + break; + } } /// /// For each page in the snapshot from [page, end), process the page for recovery. 
/// - private void RecoverSnapshotPages(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options, + private void RecoverSnapshotPages(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options, long endPage, long snapshotEndPage, int numPagesToRead, RecoveryStatus recoveryStatus, long page, long end) { for (long p = page; p < end; p++) @@ -1152,10 +1057,24 @@ private void RecoverSnapshotPages(long scanFromAddress, long recoverFromAddress, if (recoverFromAddress < endLogicalAddress && recoverFromAddress < untilAddress) ProcessReadSnapshotPage(recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, p, pageIndex); - // Issue next read - if (p + numPagesToRead < endPage) + if (hlogBase.IsObjectAllocator && hlogBase.logSizeTracker is not null) + { + // Object store under a memory budget (a size tracker is attached, so pages may be evicted during recovery — both read-time via + // TrimLogPages and load-time during the deferred object load). Flush every snapshot page to the main log, copying its objects from the + // snapshot object-log into the main object-log so the page is fully durable before it can be evicted, letting us recover into a smaller + // memory budget than was checkpointed. (Without a size tracker no eviction occurs, so we avoid these writes — which also keeps configs + // whose page size exceeds the main-log device segment, that never flush to the main log, working as before.) The objectLogRecoveryDevice + // is the snapshot object-log (copy source); the boundary page's flush starts at scanFromAddress, so only its snapshot-region records are + // processed. 
+ recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending; + hlogBase.AsyncFlushPagesForRecovery(scanFromAddress, p, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus, + recoveryStatus.objectLogRecoveryDevice, formerFlushedUntilAddress: scanFromAddress); + } + else if (!hlogBase.IsObjectAllocator && p + numPagesToRead < endPage) { - // Flush snapshot page to main log + // String store: records are fully inline, so a snapshot page is durable once written to the main log (no object copy needed) and the + // deferred object load is a no-op (so it never evicts). Flush only pages that will be pushed out of the buffer by subsequent reads, so + // read-time eviction can reclaim them; the final resident set (the last batch) stays in memory, as before. recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending; hlogBase.AsyncFlushPagesForRecovery(scanFromAddress, p, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus); } @@ -1201,13 +1120,17 @@ private void GetSnapshotPageRangesToRead(long scanFromAddress, long untilAddress recoveryDevicePageOffset = snapshotStartPage }; - // Initially issue read request for all pages that can be held in memory - // If heap memory is to be tracked, then read one page at a time to control memory usage - var totalPagesToRead = (int)(snapshotEndPage - startPage); - numPagesToReadPerIteration = hlogBase.logSizeTracker is null ? Math.Min(hlogBase.BufferSize, totalPagesToRead) : 1; + // Read as many pages as buffer allows, leaving room for at least 1 page for eviction. 
+ numPagesToReadPerIteration = Math.Min(hlogBase.BufferSize - 1, (int)(endPage - startPage)); + + // Never read more pages per batch than the memory budget allows (see GetPageRangesToRead for the full rationale): BufferSize can exceed + // MaxAllocatedPageCount when the budget is not a power-of-two page count, and a full BufferSize-1 batch would fill the circular buffer above + // MaxAllocatedPageCount with pages read below the eviction floor that TrimLogPages cannot reclaim. + if (hlogBase.logSizeTracker is not null && hlogBase.MaxAllocatedPageCount < numPagesToReadPerIteration) + numPagesToReadPerIteration = hlogBase.MaxAllocatedPageCount; } - private void ProcessReadSnapshotPage(long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options, RecoveryStatus recoveryStatus, long page, int pageIndex) + private void ProcessReadSnapshotPage(long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options, RecoveryStatus recoveryStatus, long page, int pageIndex) { // Page at hand var startLogicalAddressOfPage = hlogBase.GetLogicalAddressOfStartOfPage(page); // Do not offset for page header; that's done below and in RecoverFromPage @@ -1244,14 +1167,14 @@ private void ProcessReadSnapshotPage(long recoverFromAddress, long untilAddress, /// Recovery options (headAddress determines if page is in-memory) /// If > 0, records at or above this address will get OnRecoverySnapshotRead. /// Records below this address are main-log records that happened to share the boundary page with the snapshot. 
- private void ClearBitsOnPage(long page, long untilAddress, RecoveryOptions options, long snapshotFromAddress = 0) + private void ClearBitsOnPage(long page, long untilAddress, in RecoveryOptions options, long headAddress, long snapshotFromAddress = 0) { var startLogicalAddress = hlogBase.GetLogicalAddressOfStartOfPage(page); var endLogicalAddress = hlogBase.GetLogicalAddressOfStartOfPage(page + 1); var physicalAddress = hlogBase.GetPhysicalAddress(startLogicalAddress); // no need to clear locks for records that will not end up in main memory - if (options.headAddress >= endLogicalAddress) + if (headAddress >= endLogicalAddress) return; var pageSize = hlogBase.GetPageSize(); @@ -1266,9 +1189,8 @@ private void ClearBitsOnPage(long page, long untilAddress, RecoveryOptions optio { var recordLogicalAddress = startLogicalAddress + recordOffset; - // On the snapshot path, skip records below snapshotFromAddress — - // they are main-log records on the boundary page that were already - // processed (with OnDiskRead) in the main-log recovery pass. + // On the snapshot path, skip records below snapshotFromAddress; they are main-log records on the boundary page + // that were already processed (with OnDiskRead) in the main-log recovery pass. 
if (snapshotFromAddress == 0 || recordLogicalAddress >= snapshotFromAddress) { storeFunctions.OnDiskRead(ref logRecord); @@ -1303,7 +1225,7 @@ private void ClearBitsOnPage(long page, long untilAddress, RecoveryOptions optio /// Recovery options /// True if we touched the page (and thus it needs to be flushed), else false private unsafe bool RecoverFromPage(long recoverFromAddress, long pageFromAddressOffset, long pageUntilAddressOffset, - long pageStartLogicalAddress, long pageStartPhysicalAddress, RecoveryOptions options) + long pageStartLogicalAddress, long pageStartPhysicalAddress, in RecoveryOptions options) { Debug.Assert(pageFromAddressOffset >= hlogBase.pageHeaderSize, $"fromLogicalAddressInPage {pageFromAddressOffset} must be >= hlogBase.pageHeaderSize {hlogBase.pageHeaderSize} (which may be 0)"); Debug.Assert(pageUntilAddressOffset <= hlogBase.GetPageSize(), $"pageSize {pageUntilAddressOffset} must be <= PageSize {hlogBase.GetPageSize()}"); @@ -1374,7 +1296,7 @@ public abstract partial class AllocatorBase : IDisp where TAllocator : IAllocator { /// - /// Restore log; called from TsavoriteLog + /// Restore log; called from TsavoriteLog. TODO: This sync version is invoked via BumpCurrentEpoch, which doesn't have async support. 
/// /// /// @@ -1454,7 +1376,7 @@ private bool RestoreHybridLogInitializePages(long beginAddress, long headAddress } // Passing no objectLogDevice means we'll use the one in the allocator - AsyncReadPagesForRecovery(headPage, numPages, untilAddress, recoveryStatus); + AsyncReadPagesForRecovery(headPage, numPages, untilAddress, recoveryStatus, recoveryPhase: RecoveryPhase.None); return true; } diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs b/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs index e2518faa097..b7bd70cd0c7 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs @@ -210,7 +210,7 @@ public TsavoriteKV(KVSettings kvSettings, TStoreFunctions storeFunctions, Func - /// Recover from the latest valid checkpoint (blocking operation) - /// - /// Number of pages to preload into memory (beyond what needs to be read for recovery) - /// Whether records with versions beyond checkpoint version need to be undone (and invalidated on log) - /// Version we actually recovered to - public long Recover(int numPagesToPreload = -1, bool undoNextVersion = true) - { - FindRecoveryInfo(-1, out var recoveredHlcInfo, out var recoveredIcInfo); - return InternalRecover(recoveredIcInfo, recoveredHlcInfo, numPagesToPreload, undoNextVersion); - } - /// /// Get the version we would recover to if we were to request recovery the specified version /// @@ -409,16 +397,6 @@ public ValueTask RecoverAsync(int numPagesToPreload = -1, bool undoNextVer return InternalRecoverAsync(recoveredIcInfo, recoveredHlcInfo, numPagesToPreload, undoNextVersion, cancellationToken); } - /// - /// Recover from specific token (blocking operation) - /// - /// Token - /// Number of pages to preload into memory after recovery - /// Whether records with versions beyond checkpoint version need to be undone (and invalidated on log) - /// Version we actually recovered to - public long 
Recover(Guid fullCheckpointToken, int numPagesToPreload = -1, bool undoNextVersion = true) - => InternalRecover(fullCheckpointToken, fullCheckpointToken, numPagesToPreload, undoNextVersion); - /// /// Asynchronously recover from specific token (blocking operation) /// @@ -430,17 +408,6 @@ public long Recover(Guid fullCheckpointToken, int numPagesToPreload = -1, bool u public ValueTask RecoverAsync(Guid fullCheckpointToken, int numPagesToPreload = -1, bool undoNextVersion = true, CancellationToken cancellationToken = default) => InternalRecoverAsync(fullCheckpointToken, fullCheckpointToken, numPagesToPreload, undoNextVersion, cancellationToken); - /// - /// Recover from specific index and log token (blocking operation) - /// - /// - /// - /// Number of pages to preload into memory after recovery - /// Whether records with versions beyond checkpoint version need to be undone (and invalidated on log) - /// Version we actually recovered to - public long Recover(Guid indexCheckpointToken, Guid hybridLogCheckpointToken, int numPagesToPreload = -1, bool undoNextVersion = true) - => InternalRecover(indexCheckpointToken, hybridLogCheckpointToken, numPagesToPreload, undoNextVersion); - /// /// Asynchronously recover from specific index and log token (blocking operation) /// diff --git a/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs b/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs index c368a93bd16..40d528911f3 100644 --- a/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs +++ b/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs @@ -247,7 +247,7 @@ private TsavoriteLog(TsavoriteLogSettings logSettings, bool syncRecover, ILogger { try { - Recover(-1); + RecoverAsync(-1).AsTask().GetAwaiter().GetResult(); } catch { } } @@ -543,8 +543,7 @@ public void Initialize(long beginAddress, long committedUntilAddress, long lastC CommittedBeginAddress = beginAddress; // Align monotonic page trackers to the restored address so 
that post-recovery producer - // drive and page-shift callbacks re-arm correctly (they only advance beyond the - // initial floor). + // drive and page-shift callbacks re-arm correctly (they only advance beyond the initial floor). var resetPage = committedUntilAddress >> allocator.LogPageSizeBits; Volatile.Write(ref lastPublishedSafeTailPage, resetPage); Volatile.Write(ref lastProducerObservedPage, resetPage); @@ -567,15 +566,16 @@ public void Initialize(long beginAddress, long committedUntilAddress, long lastC /// Recover TsavoriteLog to the specific commit number, or latest if -1 /// /// Requested commit number - public void Recover(long requestedCommitNum = -1) + /// Cancellation token + public async ValueTask RecoverAsync(long requestedCommitNum = -1, CancellationToken cancellationToken = default) { if (CommittedUntilAddress > BeginAddress) throw new TsavoriteException($"Already recovered until address {CommittedUntilAddress}"); - if (requestedCommitNum == -1) - RestoreLatest(out RecoveredCookie); - else - RestoreSpecificCommit(requestedCommitNum, out RecoveredCookie); + RecoveredCookie = requestedCommitNum == -1 + ? 
await RestoreLatestAsync(cancellationToken).ConfigureAwait(false) + : await RestoreSpecificCommitAsync(requestedCommitNum, cancellationToken).ConfigureAwait(false); + persistedCommitNum = commitNum; } /// @@ -587,10 +587,7 @@ public static async ValueTask CreateAsync(TsavoriteLogSettings log { var log = new TsavoriteLog(logSettings, false); if (logSettings.TryRecoverLatest) - { - var cookie = await log.RestoreLatestAsync(cancellationToken).ConfigureAwait(false); - log.RecoveredCookie = cookie; - } + await log.RecoverAsync(cancellationToken: cancellationToken).ConfigureAwait(false); return log; } @@ -2754,18 +2751,6 @@ private void SerialCommitCallbackWorker(CommitInfo commitInfo) _ = (_commitTcs?.TrySetResult(lci)); } - /// - /// Synchronously recover instance to TsavoriteLog's latest valid commit, when being used as a readonly log iterator - /// - public void RecoverReadOnly() - { - if (!readOnlyMode) - throw new TsavoriteException("This method can only be used with a read-only TsavoriteLog instance used for iteration. Set TsavoriteLogSettings.ReadOnlyMode to true during creation to indicate this."); - - RestoreLatest(out _); - SignalWaitingROIterators(); - } - /// /// Asynchronously recover instance to TsavoriteLog's latest commit, when being used as a readonly log iterator /// @@ -2774,7 +2759,8 @@ public async ValueTask RecoverReadOnlyAsync(CancellationToken cancellationToken if (!readOnlyMode) throw new TsavoriteException("This method can only be used with a read-only TsavoriteLog instance used for iteration. 
Set TsavoriteLogSettings.ReadOnlyMode to true during creation to indicate this."); - _ = await RestoreLatestAsync(cancellationToken).ConfigureAwait(false); + RecoveredCookie = await RestoreLatestAsync(cancellationToken).ConfigureAwait(false); + persistedCommitNum = commitNum; SignalWaitingROIterators(); } @@ -2814,9 +2800,11 @@ private bool LoadCommitMetadata(long commitNum, out TsavoriteLogRecoveryInfo inf return true; } - private void RestoreLatest(out byte[] cookie) + /// + /// Restore log asynchronously + /// + private async ValueTask RestoreLatestAsync(CancellationToken cancellationToken) { - cookie = null; TsavoriteLogRecoveryInfo info = new(); long scanStart = 0; @@ -2836,7 +2824,7 @@ private void RestoreLatest(out byte[] cookie) // Only in fast commit mode will we potentially need to recover from an entry in the log if (fastCommitMode) { - // Disable safe guards temporarily + // Shut up safe guards, I know what I am doing CommittedUntilAddress = long.MaxValue; beginAddress = info.BeginAddress; allocator.HeadAddress = long.MaxValue; @@ -2848,19 +2836,18 @@ private void RestoreLatest(out byte[] cookie) catch { } } - // If until address is 0, that means info is still its default value and we haven't been able to recover + // if until address is 0, that means info is still its default value and we haven't been able to recover // from any any commit. 
Set the log to its start position and return if (info.UntilAddress == 0) { - logger?.LogInformation("Unable to recover using any available commit"); - - // Reset variables to normal + logger?.LogDebug("Unable to recover using any available commit"); + // Reset things to be something normal lol allocator.Initialize(); CommittedUntilAddress = FirstValidAddress; beginAddress = allocator.BeginAddress; if (readOnlyMode) allocator.HeadAddress = long.MaxValue; - return; + return null; } if (!readOnlyMode) @@ -2871,33 +2858,32 @@ private void RestoreLatest(out byte[] cookie) if (headAddress == 0) headAddress = FirstValidAddress; - try { - allocator.RestoreHybridLog(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress); + await allocator.RestoreHybridLogAsync(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress, cancellationToken: cancellationToken).ConfigureAwait(false); } catch { - if (!tolerateDeviceFailure) throw; + if (!tolerateDeviceFailure) + throw; } } CompleteRestoreFromCommit(info); - cookie = info.Cookie; + var cookie = info.Cookie; commitNum = info.CommitNum; - // After recovery, persisted commitnum remains 0 so we need to set it to latest commit number - persistedCommitNum = info.CommitNum; beginAddress = allocator.BeginAddress; if (readOnlyMode) allocator.HeadAddress = long.MaxValue; if (scanStart > 0) logCommitManager.OnRecovery(scanStart); + + return cookie; } - private void RestoreSpecificCommit(long requestedCommitNum, out byte[] cookie) + private async ValueTask RestoreSpecificCommitAsync(long requestedCommitNum, CancellationToken cancellationToken) { - cookie = null; TsavoriteLogRecoveryInfo info = new(); // Find the closest commit metadata with commit num smaller than requested @@ -2951,84 +2937,13 @@ private void RestoreSpecificCommit(long requestedCommitNum, out byte[] cookie) headAddress = FirstValidAddress; try { - allocator.RestoreHybridLog(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress); + 
await allocator.RestoreHybridLogAsync(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress, cancellationToken: cancellationToken).ConfigureAwait(false); } catch { - if (!tolerateDeviceFailure) throw; - } - } - - CompleteRestoreFromCommit(info); - cookie = info.Cookie; - commitNum = persistedCommitNum = info.CommitNum; - beginAddress = allocator.BeginAddress; - if (readOnlyMode) - allocator.HeadAddress = long.MaxValue; - - if (scanStart > 0) - logCommitManager.OnRecovery(scanStart); - } - - /// - /// Restore log asynchronously - /// - private async ValueTask RestoreLatestAsync(CancellationToken cancellationToken) - { - TsavoriteLogRecoveryInfo info = new(); - - long scanStart = 0; - foreach (var metadataCommit in logCommitManager.ListCommits()) - { - try - { - if (LoadCommitMetadata(metadataCommit, out info)) - { - scanStart = metadataCommit; - break; - } - } - catch { } - } - - // Only in fast commit mode will we potentially need to recover from an entry in the log - if (fastCommitMode) - { - // Shut up safe guards, I know what I am doing - CommittedUntilAddress = long.MaxValue; - beginAddress = info.BeginAddress; - allocator.HeadAddress = long.MaxValue; - try - { - using var scanIterator = Scan(info.UntilAddress, long.MaxValue, recover: false); - _ = scanIterator.ScanForwardForCommit(ref info); + if (!tolerateDeviceFailure) + throw; } - catch { } - } - - // if until address is 0, that means info is still its default value and we haven't been able to recover - // from any any commit. 
Set the log to its start position and return - if (info.UntilAddress == 0) - { - logger?.LogDebug("Unable to recover using any available commit"); - // Reset things to be something normal lol - allocator.Initialize(); - CommittedUntilAddress = FirstValidAddress; - beginAddress = allocator.BeginAddress; - if (readOnlyMode) - allocator.HeadAddress = long.MaxValue; - return null; - } - - if (!readOnlyMode) - { - var headAddress = info.UntilAddress - allocator.GetOffsetOnPage(info.UntilAddress); - if (info.BeginAddress > headAddress) - headAddress = info.BeginAddress; - - if (headAddress == 0) - headAddress = FirstValidAddress; - await allocator.RestoreHybridLogAsync(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress, cancellationToken: cancellationToken).ConfigureAwait(false); } CompleteRestoreFromCommit(info); diff --git a/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs b/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs index 9bff4c26769..d67073c9c8a 100644 --- a/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs +++ b/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs @@ -10,6 +10,8 @@ namespace Tsavorite.core { + internal enum RecoveryPhase : byte { None = 0, Pass1 = 1, Pass2 = 2 } + /// /// Result of async page read /// @@ -47,12 +49,12 @@ public sealed class PageAsyncReadResult /// The max offset on the main log page to iterate records when determining how many bytes in the ObjectLog to read. internal long maxAddressOffsetOnPage; - /// If true, we are called from recovery, and should use the non-transient . - internal bool isForRecovery; + /// The recovery phase for this read. Non- uses the non-transient . 
+ internal RecoveryPhase recoveryPhase; /// public override string ToString() - => $"page {page}, isRecov {isForRecovery}, devPgOffset {devicePageOffset}, ctx {context}, countdown {handle?.CurrentCount}, destPtr {destinationPtr} (0x{destinationPtr:X}), maxPtr {maxAddressOffsetOnPage}"; + => $"page {page}, recovPhase {recoveryPhase}, devPgOffset {devicePageOffset}, ctx {context}, countdown {handle?.CurrentCount}, destPtr {destinationPtr} (0x{destinationPtr:X}), maxPtr {maxAddressOffsetOnPage}"; /// Currently nothing to free. public void Free() @@ -128,6 +130,14 @@ public sealed class PageAsyncFlushResult /// If this is set then we are using a different objectLog device from that in the allocator, and do not use the allocator's . internal ObjectLogFilePositionInfo objectLogFilePositionInfo; + /// During snapshot recovery, the snapshot object-log device that is the source for copying object bytes into the main object-log + /// (for records at/above ). Null for non-recovery flushes and for the hybrid-log region. + internal IDevice recoverySnapshotObjectLogDevice; + + /// During snapshot recovery, the former FlushedUntilAddress (the hybrid-log/snapshot boundary). Records whose logical address is at or + /// above this are in the snapshot region and their objects must be copied from the snapshot object-log to the main object-log during the flush. 
+ internal long recoveryFormerFlushedUntilAddress; + /// public override string ToString() { diff --git a/libs/storage/Tsavorite/cs/test/MiscTests.cs b/libs/storage/Tsavorite/cs/test/MiscTests.cs index d1d85537882..cc9b7112c1f 100644 --- a/libs/storage/Tsavorite/cs/test/MiscTests.cs +++ b/libs/storage/Tsavorite/cs/test/MiscTests.cs @@ -3,6 +3,7 @@ using System; using System.IO; +using System.Threading.Tasks; using Garnet.test; using NUnit.Framework; using NUnit.Framework.Legacy; @@ -41,7 +42,7 @@ public void TearDown() [Test] [Category("TsavoriteKV")] - public void ForceRCUAndRecover([Values(UpdateOp.Upsert, UpdateOp.Delete)] UpdateOp updateOp) + public async Task ForceRCUAndRecover([Values(UpdateOp.Upsert, UpdateOp.Delete)] UpdateOp updateOp) { var copyOnWrite = new FunctionsCopyOnWrite(); ClientSession session = default; @@ -115,7 +116,7 @@ public void ForceRCUAndRecover([Values(UpdateOp.Upsert, UpdateOp.Delete)] Update , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions) ); - _ = store.Recover(token); + _ = await store.RecoverAsync(token).ConfigureAwait(false); session = store.NewSession(copyOnWrite); bContext = session.BasicContext; diff --git a/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs b/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs index 4d20b12fbe3..5a7aa8f9e6a 100644 --- a/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs +++ b/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs @@ -51,7 +51,7 @@ public void TearDown() [Category("TsavoriteKV")] [Category("CheckpointRestore")] [Category("Smoke")] - public async ValueTask SharedLogDirectory([Values] bool isAsync) + public async ValueTask SharedLogDirectory() { original.Initialize(Path.Join(TestUtils.MethodTestDir, "OriginalCheckpoint"), sharedLogDirectory); ClassicAssert.IsTrue(IsDirectoryEmpty(sharedLogDirectory)); // sanity check @@ -72,10 +72,7 @@ public async ValueTask SharedLogDirectory([Values] bool isAsync) // Recover from original checkpoint 
clone.Initialize(cloneCheckpointDirectory, sharedLogDirectory, populateLogHandles: true); - if (isAsync) - _ = await clone.Store.RecoverAsync(checkpointGuid).ConfigureAwait(false); - else - _ = clone.Store.Recover(checkpointGuid); + _ = await clone.Store.RecoverAsync(checkpointGuid).ConfigureAwait(false); // Both sessions should work concurrently Test(original, checkpointGuid); diff --git a/libs/storage/Tsavorite/cs/test/TestUtils.cs b/libs/storage/Tsavorite/cs/test/TestUtils.cs index 7090cb8e7a1..be9c472189b 100644 --- a/libs/storage/Tsavorite/cs/test/TestUtils.cs +++ b/libs/storage/Tsavorite/cs/test/TestUtils.cs @@ -220,8 +220,6 @@ public enum AllocatorType Object } - public enum CompletionSyncMode { Sync, Async } - public enum ReadCopyDestination { Tail, ReadCache } public enum FlushMode { NoFlush, ReadOnly, OnDisk } diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs index bf408ca5295..06735b5a2b0 100644 --- a/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs @@ -203,6 +203,5 @@ public async ValueTask FlakyLogTestTolerateFailure([Values] IteratorType iterato } recoveredLog.Dispose(); } - } } \ No newline at end of file diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs index 6539699e21b..b04683de1ad 100644 --- a/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.IO; using System.Threading; +using System.Threading.Tasks; using NUnit.Framework; using NUnit.Framework.Legacy; using Tsavorite.core; @@ -23,7 +24,7 @@ internal class LogFastCommitTests : TsavoriteLogTestBase [Test] [Category("TsavoriteLog")] [Category("Smoke")] - public void 
TsavoriteLogSimpleFastCommitTest([Values] TestUtils.TestDeviceType deviceType) + public async Task TsavoriteLogSimpleFastCommitTest([Values] TestUtils.TestDeviceType deviceType) { var cookie = new byte[100]; new Random().NextBytes(cookie); @@ -70,13 +71,13 @@ public void TsavoriteLogSimpleFastCommitTest([Values] TestUtils.TestDeviceType d // Recovery should still work var recoveredLog = new TsavoriteLog(logSettings); - recoveredLog.Recover(1); + await recoveredLog.RecoverAsync(1).ConfigureAwait(false); ClassicAssert.AreEqual(cookie1, recoveredLog.RecoveredCookie); ClassicAssert.AreEqual(commit1Addr, recoveredLog.TailAddress); recoveredLog.Dispose(); recoveredLog = new TsavoriteLog(logSettings); - recoveredLog.Recover(2); + await recoveredLog.RecoverAsync(2).ConfigureAwait(false); ClassicAssert.AreEqual(cookie2, recoveredLog.RecoveredCookie); ClassicAssert.AreEqual(commit2Addr, recoveredLog.TailAddress); recoveredLog.Dispose(); diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs index 985bb96f9e6..b10ebfd071b 100644 --- a/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs @@ -50,15 +50,15 @@ public void TearDown() [Test] [Category("TsavoriteLog")] - public async Task RecoverReadOnlyCheck1([Values] bool isAsync) + public async Task RecoverReadOnlyCheck1() { using var device = Devices.CreateLogDevice(deviceName); var logSettings = new TsavoriteLogSettings { LogDevice = device, MemorySizeBits = MinKvLogMemorySizeBits, PageSizeBits = MinKvLogPageSizeBits, MutableFraction = 0.5, SegmentSizeBits = MinKvLogPageSizeBits + 1, TryRecoverLatest = false }; - using var log = isAsync ? 
await TsavoriteLog.CreateAsync(logSettings).ConfigureAwait(false) : new TsavoriteLog(logSettings); + using var log = await TsavoriteLog.CreateAsync(logSettings).ConfigureAwait(false); await Task.WhenAll(ProducerAsync(log, cts), CommitterAsync(log, cts.Token), - ReadOnlyConsumerAsync(deviceName, isAsync, cts.Token)).ConfigureAwait(false); + ReadOnlyConsumerAsync(deviceName, cts.Token)).ConfigureAwait(false); } private async Task ProducerAsync(TsavoriteLog log, CancellationTokenSource cts) @@ -86,17 +86,17 @@ private static async Task CommitterAsync(TsavoriteLog log, CancellationToken can catch (OperationCanceledException) { } } - // This creates a separate TsavoriteLog over the same log file, using RecoverReadOnly to continuously update + // This creates a separate TsavoriteLog over the same log file, using RecoverReadOnlyAsync to continuously update // to the primary TsavoriteLog's commits. - private async Task ReadOnlyConsumerAsync(string deviceName, bool isAsync, CancellationToken cancellationToken) + private async Task ReadOnlyConsumerAsync(string deviceName, CancellationToken cancellationToken) { using var device = Devices.CreateLogDevice(deviceName); var logSettings = new TsavoriteLogSettings { LogDevice = device, ReadOnlyMode = true, PageSizeBits = MinKvLogPageSizeBits, SegmentSizeBits = MinKvLogPageSizeBits + 1 }; - using var log = isAsync ? await TsavoriteLog.CreateAsync(logSettings, cancellationToken).ConfigureAwait(false) : new TsavoriteLog(logSettings); + using var log = await TsavoriteLog.CreateAsync(logSettings, cancellationToken).ConfigureAwait(false); var _ = BeginRecoverAsyncLoop(); - // This enumerator waits asynchronously when we have reached the committed tail of the duplicate TsavoriteLog. When RecoverReadOnly + // This enumerator waits asynchronously when we have reached the committed tail of the duplicate TsavoriteLog. 
When RecoverReadOnlyAsync // reads new data committed by the primary TsavoriteLog, it signals commit completion to let iter continue to the new tail. using var iter = log.Scan(log.BeginAddress, long.MaxValue); var prevValue = -1L; @@ -127,12 +127,7 @@ async Task BeginRecoverAsyncLoop() { try { - if (isAsync) - { - await log.RecoverReadOnlyAsync(cancellationToken).ConfigureAwait(false); - } - else - log.RecoverReadOnly(); + await log.RecoverReadOnlyAsync(cancellationToken).ConfigureAwait(false); break; } catch @@ -140,7 +135,7 @@ async Task BeginRecoverAsyncLoop() Thread.Yield(); // retry until timeout if (DateTimeOffset.UtcNow.Ticks - startTime > TimeSpan.FromSeconds(5).Ticks) - throw new Exception("Timed out retrying RecoverReadOnly"); + throw new Exception("Timed out retrying RecoverReadOnlyAsync"); } } } diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs index 00b19cd27ae..1a4a33c2480 100644 --- a/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs @@ -1048,7 +1048,7 @@ public void TsavoriteLogSimpleCommitCookieTest([Values] bool fastCommit) [Test] [Category("TsavoriteLog")] - public void TsavoriteLogManualCommitTest() + public async Task TsavoriteLogManualCommitTest() { device = Devices.CreateLogDevice(Path.Join(MethodTestDir, "logManualCommitTest.log"), deleteOnClose: true); var logSettings = new TsavoriteLogSettings @@ -1089,13 +1089,13 @@ public void TsavoriteLogManualCommitTest() ClassicAssert.IsTrue(commitSuccessful); var recoveredLog = new TsavoriteLog(logSettings); - recoveredLog.Recover(1); + await recoveredLog.RecoverAsync(1).ConfigureAwait(false); ClassicAssert.AreEqual(cookie1, recoveredLog.RecoveredCookie); ClassicAssert.AreEqual(commit1Addr, recoveredLog.TailAddress); recoveredLog.Dispose(); recoveredLog = new TsavoriteLog(logSettings); - recoveredLog.Recover(2); + await 
recoveredLog.RecoverAsync(2).ConfigureAwait(false); ClassicAssert.AreEqual(cookie2, recoveredLog.RecoveredCookie); ClassicAssert.AreEqual(commit2Addr, recoveredLog.TailAddress); recoveredLog.Dispose(); @@ -1104,7 +1104,7 @@ public void TsavoriteLogManualCommitTest() try { recoveredLog = new TsavoriteLog(logSettings); - recoveredLog.Recover(4); + await recoveredLog.RecoverAsync(4).ConfigureAwait(false); Assert.Fail(); } catch (TsavoriteException) diff --git a/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs b/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs index 6e4531dba20..ad0a7609110 100644 --- a/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs @@ -253,7 +253,7 @@ public void CopyUpdateDoesNotFireOnDisposeCopyUpdated() } ClassicAssert.AreEqual(0, tracker.DisposeCount(DisposeReason.CopyUpdated), - "CopyUpdated is handled internally by logSizeTracker — OnDispose must not fire for it"); + "CopyUpdated is handled internally by logSizeTracker; OnDispose must not fire for it"); ClassicAssert.AreEqual(0, tracker.DisposeCount(DisposeReason.Deleted), "Deleted must not fire on a CopyUpdate path"); ClassicAssert.AreEqual(0, tracker.TotalEvict(), diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs index 13e2a08d4c0..2ef4c98818d 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs @@ -3,6 +3,7 @@ using System; using System.IO; +using System.Threading; using System.Threading.Tasks; using Garnet.test; using NUnit.Framework; @@ -68,15 +69,12 @@ private static unsafe void Finish_MallocFixedPageSizeRecoveryTest(int seed, IDev [Test] [Category("CheckpointRestore")] [Category("Smoke")] - public void MallocFixedPageSizeRecoveryTest() + 
public async Task MallocFixedPageSizeRecoveryTest() { Setup_MallocFixedPageSizeRecoveryTest(out int seed, out IDevice device, out int numBucketsToAdd, out long[] logicalAddresses, out ulong numBytesWritten); var recoveredAllocator = new MallocFixedPageSize(); - //issue call to recover - recoveredAllocator.BeginRecovery(device, 0, numBucketsToAdd, numBytesWritten, out ulong numBytesRead); - //wait until complete - recoveredAllocator.IsRecoveryCompleted(true); + var numBytesRead = await recoveredAllocator.RecoverAsync(device, 0, numBucketsToAdd, numBytesWritten, CancellationToken.None).ConfigureAwait(false); Finish_MallocFixedPageSizeRecoveryTest(seed, device, numBucketsToAdd, logicalAddresses, numBytesWritten, recoveredAllocator, numBytesRead); } @@ -158,7 +156,7 @@ private static unsafe void Finish_FuzzyIndexRecoveryTest(int seed, long numAdds, [Test] [Category("CheckpointRestore")] [Category("Smoke")] - public unsafe void FuzzyIndexRecoveryTest() + public async Task FuzzyIndexRecoveryTest() { Setup_FuzzyIndexRecoveryTest(out int seed, out int size, out long numAdds, out IDevice ht_device, out IDevice ofb_device, out TsavoriteBase hash_table1, out ulong ht_num_bytes_written, out ulong ofb_num_bytes_written, out int num_ofb_buckets); @@ -166,10 +164,7 @@ public unsafe void FuzzyIndexRecoveryTest() var hash_table2 = new TsavoriteBase(); hash_table2.Initialize(size, 512); - //issue recover call - hash_table2.RecoverFuzzyIndex(0, ht_device, ht_num_bytes_written, ofb_device, num_ofb_buckets, ofb_num_bytes_written); - //wait until complete - hash_table2.IsFuzzyIndexRecoveryComplete(true); + await hash_table2.RecoverFuzzyIndexAsync(0, ht_device, ht_num_bytes_written, ofb_device, num_ofb_buckets, ofb_num_bytes_written, CancellationToken.None).ConfigureAwait(false); Finish_FuzzyIndexRecoveryTest(seed, numAdds, ht_device, ofb_device, hash_table1, hash_table2); } diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs 
b/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs index e01ee15c67a..452f103cc87 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs @@ -68,7 +68,7 @@ public async ValueTask LargeObjectTest([Values(CheckpointType.Snapshot, Checkpoi , StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestLargeObjectValue.Serializer()) , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions))) { - _ = store.Recover(token); + _ = await store.RecoverAsync(token).ConfigureAwait(false); using (var session = store.NewSession(new TestLargeObjectFunctions())) DoRead(session, numObjects, store); @@ -147,7 +147,7 @@ public async ValueTask MultiListObjectTest([Values(CheckpointType.Snapshot, Chec DoRead(session, numObjects, store); _ = store.TryInitiateFullCheckpoint(out token, checkpointType); - await store.CompleteCheckpointAsync(); + await store.CompleteCheckpointAsync().ConfigureAwait(false); } // Step 1: Create and recover store. @@ -157,7 +157,7 @@ public async ValueTask MultiListObjectTest([Values(CheckpointType.Snapshot, Chec , StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestMultiListObjectValue.Serializer()) , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions))) { - _ = store.Recover(token); + _ = await store.RecoverAsync(token).ConfigureAwait(false); using (var session = store.NewSession(new TestMultiListObjectFunctions())) DoRead(session, numObjects, store); diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoverySnapshotEvictionTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoverySnapshotEvictionTests.cs new file mode 100644 index 00000000000..1b0d3c27693 --- /dev/null +++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoverySnapshotEvictionTests.cs @@ -0,0 +1,207 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+
+using System.IO;
+using System.Threading.Tasks;
+using Garnet.test;
+using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using Tsavorite.core;
+using static Tsavorite.test.TestUtils;
+
+namespace Tsavorite.test.recovery.objects
+{
+    using ClassAllocator = ObjectAllocator>;
+    using ClassStoreFunctions = StoreFunctions;
+
+    /// <summary>
+    /// Exercises the deferred object-load path of snapshot recovery (see RecoverHybridLogFromSnapshotFileAsync):
+    /// the hybrid-log phase reads its pages without loading their objects, then after the snapshot pages have also been
+    /// read (without their objects), objects are loaded once over the full recovered range honoring the final headAddress.
+    /// The recovered range spans both the hybrid-log region (objects in the main object-log) and the snapshot region
+    /// (objects in the snapshot object-log), with the device boundary at the page that contains FlushedUntilAddress.
+    /// A LogSizeTracker is optionally attached to the recovery store to force
+    /// eviction during the deferred load, covering: no eviction (both region loads run over resident pages), partial
+    /// eviction (headAddress stays in the hybrid-log region so both loads run), and heavy eviction (headAddress is pushed
+    /// into the snapshot region so only the snapshot-region load runs). A non-power-of-2 buffer is also covered.
+    /// </summary>
+    [TestFixture]
+    public class ObjectRecoverySnapshotEvictionTests : TestBase
+    {
+        const int NumRecords = 6000;
+
+        [SetUp]
+        public void Setup() => RecreateDirectory(MethodTestDir);
+
+        [TearDown]
+        public void TearDown() => TestUtils.OnTearDown();
+
+        // recoveryTargetPageCount: 0 => no size tracker (no eviction); otherwise attach a tracker whose target is that many
+        // pages, forcing eviction during recovery. 4 is the minimum (LogSizeTracker.MinTargetPageCount). Small values force
+        // the snapshot-only load; larger values leave the head in the hybrid-log region so both region loads run.
+        // logMemoryPages: the max allocated page count; 24 is not a power of two, so BufferSize (next power of two = 32)
+        // has empty slots that the load loop must skip.
+        [Test]
+        [Category("TsavoriteKV"), Category("CheckpointRestore")]
+        public async Task SnapshotRecoveryDeferredObjectLoad(
+            [Values(0, 4, 8, 20, 64)] int recoveryTargetPageCount,
+            [Values(32, 24)] int logMemoryPages)
+        {
+            var logMemorySize = (long)logMemoryPages * MinKvLogPageSize;
+
+            // Write records (spanning many pages so some are flushed to the main log before the checkpoint, creating a
+            // hybrid-log region) and take a Snapshot checkpoint capturing the still-mutable region as the snapshot region.
+            Prepare(logMemorySize, out var log, out var objlog, out var store);
+            try
+            {
+                using (var session = store.NewSession(new TestObjectFunctions()))
+                {
+                    var bContext = session.BasicContext;
+                    for (var i = 0; i < NumRecords; i++)
+                        _ = bContext.Upsert(new TestObjectKey { key = i }, new TestObjectValue { value = i });
+                }
+
+                _ = store.TryInitiateHybridLogCheckpoint(out var token, CheckpointType.Snapshot);
+                await store.CompleteCheckpointAsync().ConfigureAwait(false);
+                Destroy(log, objlog, store);
+
+                // Recover into a fresh store, optionally under memory pressure so the deferred object load must evict.
+                Prepare(logMemorySize, out log, out objlog, out store);
+                if (recoveryTargetPageCount > 0)
+                {
+                    var targetSize = (long)recoveryTargetPageCount * MinKvLogPageSize;
+                    var tracker = new LogSizeTracker(store.Log, targetSize, targetSize / 8, targetSize / 16, logger: null);
+                    store.Log.SetLogSizeTracker(tracker);
+                }
+
+                _ = await store.RecoverAsync(default, token).ConfigureAwait(false);
+
+                // Every record must recover correctly, whether it ended up resident or was evicted (and is read from disk).
+                using (var session = store.NewSession(new TestObjectFunctions()))
+                {
+                    var bContext = session.BasicContext;
+                    for (var i = 0; i < NumRecords; i++)
+                    {
+                        var key = new TestObjectKey { key = i };
+                        TestObjectInput input = default;
+                        TestObjectOutput output = new();
+                        var status = bContext.Read(key, ref input, ref output);
+                        if (status.IsPending)
+                        {
+                            Assert.That(bContext.CompletePendingWithOutputs(out var completedOutputs, wait: true), Is.True);
+                            (status, output) = GetSinglePendingResult(completedOutputs);
+                        }
+
+                        ClassicAssert.IsTrue(status.Found, $"key {i} not found (target pages {recoveryTargetPageCount}, mem pages {logMemoryPages})");
+                        ClassicAssert.AreEqual(i, output.value.value, $"key {i} wrong value");
+                    }
+                }
+
+                // With a small memory budget, eviction must have advanced the head above the begin address.
+                if (recoveryTargetPageCount is > 0 and <= 8)
+                    ClassicAssert.Greater(store.Log.HeadAddress, store.Log.BeginAddress, "expected eviction to advance HeadAddress");
+            }
+            finally
+            {
+                Destroy(log, objlog, store);
+            }
+        }
+
+        // After recovering an object store into a smaller memory budget (so snapshot object pages are evicted and their objects are read back from the
+        // main object-log that RecoverSnapshotPages copied them into), compact the log and truncate it, then verify every record is still readable.
+        // Compaction reads each live record's objects from the main object-log (validating the copied positions), and Truncate drops the now-stale main-log
+        // and object-log segments using each page header's lowest-object-log position (which the recovery flush set to the main object-log).
+        [Test]
+        [Category("TsavoriteKV"), Category("CheckpointRestore")]
+        public async Task SnapshotRecoveryThenCompactTruncate(
+            [Values] CompactionType compactionType,
+            [Values(32, 24)] int logMemoryPages)
+        {
+            var logMemorySize = (long)logMemoryPages * MinKvLogPageSize;
+
+            Prepare(logMemorySize, out var log, out var objlog, out var store);
+            try
+            {
+                using (var session = store.NewSession(new TestObjectFunctions()))
+                {
+                    var bContext = session.BasicContext;
+                    for (var i = 0; i < NumRecords; i++)
+                        _ = bContext.Upsert(new TestObjectKey { key = i }, new TestObjectValue { value = i });
+                }
+
+                _ = store.TryInitiateHybridLogCheckpoint(out var token, CheckpointType.Snapshot);
+                await store.CompleteCheckpointAsync().ConfigureAwait(false);
+                Destroy(log, objlog, store);
+
+                // Recover under memory pressure so snapshot object pages are evicted during recovery (their objects must be read back from the main object-log).
+                Prepare(logMemorySize, out log, out objlog, out store);
+                var targetSize = 8L * MinKvLogPageSize;
+                var tracker = new LogSizeTracker(store.Log, targetSize, targetSize / 8, targetSize / 16, logger: null);
+                store.Log.SetLogSizeTracker(tracker);
+
+                _ = await store.RecoverAsync(default, token).ConfigureAwait(false);
+
+                // Recovery has forced eviction of snapshot object pages (their objects were copied into the main object-log). Relax the budget before
+                // compaction so the tight recovery target does not starve Compact's allocation (Compact copies live records to the tail); the log still
+                // spills to disk via its normal LogMemorySize-driven eviction, so compaction continues to read evicted records' objects from the main object-log.
+                tracker.UpdateTargetSize(1L << 30, 1L << 27, 1L << 26);
+
+                using (var session = store.NewSession(new TestObjectFunctions()))
+                {
+                    var bContext = session.BasicContext;
+
+                    // Compact the entire recovered region (reading each live record's objects from the main object-log), then truncate the stale segments.
+                    var compactUntil = session.Compact(store.Log.TailAddress, compactionType);
+                    store.Log.Truncate();
+                    ClassicAssert.AreEqual(compactUntil, store.Log.BeginAddress, "BeginAddress should advance to compactUntil after Truncate");
+
+                    // Every record must still be readable after Compact + Truncate.
+                    for (var i = 0; i < NumRecords; i++)
+                    {
+                        var key = new TestObjectKey { key = i };
+                        TestObjectInput input = default;
+                        TestObjectOutput output = new();
+                        var status = bContext.Read(key, ref input, ref output);
+                        if (status.IsPending)
+                        {
+                            Assert.That(bContext.CompletePendingWithOutputs(out var completedOutputs, wait: true), Is.True);
+                            (status, output) = GetSinglePendingResult(completedOutputs);
+                        }
+
+                        ClassicAssert.IsTrue(status.Found, $"key {i} not found after compact/truncate (compactionType {compactionType}, mem pages {logMemoryPages})");
+                        ClassicAssert.AreEqual(i, output.value.value, $"key {i} wrong value after compact/truncate");
+                    }
+                }
+            }
+            finally
+            {
+                Destroy(log, objlog, store);
+            }
+        }
+
+        private static void Prepare(long logMemorySize, out IDevice log, out IDevice objlog, out TsavoriteKV store)
+        {
+            log = Devices.CreateLogDevice(Path.Combine(MethodTestDir, "snapevict.log"));
+            objlog = Devices.CreateLogDevice(Path.Combine(MethodTestDir, "snapevict.obj.log"));
+            store = new(new()
+            {
+                IndexSize = 1L << 22,
+                LogDevice = log,
+                ObjectLogDevice = objlog,
+                SegmentSize = 1L << 20,
+                LogMemorySize = logMemorySize,
+                PageSize = MinKvLogPageSize,
+                CheckpointDir = Path.Combine(MethodTestDir, "check-points")
+            }, StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestObjectValue.Serializer())
+                , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)
+            );
+        }
+
+        private static void Destroy(IDevice log, IDevice objlog, TsavoriteKV store)
+        {
+            store.Dispose();
+            log.Dispose();
+            objlog.Dispose();
+        }
+    }
+}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs
b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs index a263af618fc..63e5d9cf6a8 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs @@ -11,7 +11,6 @@ namespace Tsavorite.test.recovery.objects { - using static Tsavorite.test.TestUtils; using ClassAllocator = ObjectAllocator>; using ClassStoreFunctions = StoreFunctions; @@ -78,15 +77,12 @@ private void PrepareToRecover() [Test] [Category("TsavoriteKV"), Category("CheckpointRestore")] - public async ValueTask ObjectRecoveryTest1([Values] CompletionSyncMode syncMode) + public async ValueTask ObjectRecoveryTest1() { Populate(); PrepareToRecover(); - if (syncMode == CompletionSyncMode.Async) - _ = await store.RecoverAsync(token, token).ConfigureAwait(false); - else - _ = store.Recover(token, token); + _ = await store.RecoverAsync(token, token).ConfigureAwait(false); Verify(token, token); } diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs index 48efb240fc0..c77171d8be1 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs @@ -38,8 +38,7 @@ public void TearDown() public async ValueTask ObjectRecoveryTest2( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Range(300, 700, 300)] int numberOfRecords, - [Values] CompletionSyncMode syncMode) + [Range(300, 700, 300)] int numberOfRecords) { this.numberOfRecords = numberOfRecords; @@ -53,7 +52,7 @@ public async ValueTask ObjectRecoveryTest2( session.Dispose(); _ = store.TryInitiateFullCheckpoint(out var guid, checkpointType); // guid is useful for debugging, but not otherwise used in this test - await store.CompleteCheckpointAsync(); + await store.CompleteCheckpointAsync().ConfigureAwait(false); Destroy(log, objlog, 
store); } @@ -62,10 +61,7 @@ public async ValueTask ObjectRecoveryTest2( { Prepare(out var log, out var objlog, out var store); - if (syncMode == CompletionSyncMode.Async) - _ = await store.RecoverAsync().ConfigureAwait(false); - else - _ = store.Recover(); + _ = await store.RecoverAsync().ConfigureAwait(false); var session = store.NewSession(new TestObjectFunctions()); Read(session, delete: true); diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs index 1f2483c04e5..905834b63c0 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs @@ -36,8 +36,7 @@ public void TearDown() [Category("TsavoriteKV"), Category("CheckpointRestore")] public async ValueTask ObjectRecoveryTest3( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values(1000)] int iterations, - [Values] CompletionSyncMode syncMode) + [Values(1000)] int iterations) { this.iterations = iterations; Prepare(out IDevice log, out IDevice objlog, out var store); @@ -56,10 +55,7 @@ public async ValueTask ObjectRecoveryTest3( { Prepare(out log, out objlog, out store); - if (syncMode == CompletionSyncMode.Async) - _ = await store.RecoverAsync(default, item.Item2).ConfigureAwait(false); - else - _ = store.Recover(default, item.Item2); + _ = await store.RecoverAsync(default, item.Item2).ConfigureAwait(false); var session2 = store.NewSession(new TestObjectFunctions()); Read(session2, false, item.Item1); diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs index a1efb3cb892..5b438859715 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs @@ -98,7 +98,7 @@ public class RecoveryCheck1Tests : 
RecoveryCheckBase public async ValueTask RecoveryCheck1( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) + [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) { const long pageSize = MinKvLogPageSize; using var store1 = new TsavoriteKV(new() @@ -168,16 +168,8 @@ public async ValueTask RecoveryCheck1( , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions) ); - if (completionSyncMode == CompletionSyncMode.Async) - { - var (status, token) = await task; - _ = await store2.RecoverAsync(default, token); - } - else - { - var (status, token) = task.AsTask().GetAwaiter().GetResult(); - _ = store2.Recover(default, token); - } + var (_, token) = await task.ConfigureAwait(false); + _ = await store2.RecoverAsync(default, token).ConfigureAwait(false); ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress); ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress); @@ -218,7 +210,7 @@ public class RecoveryCheck2Tests : RecoveryCheckBase //[Repeat(3000)] public async ValueTask RecoveryCheck2( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) + [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) { const long pageSize = MinKvLogPageSize; using var store1 = new TsavoriteKV(new() @@ -289,16 +281,8 @@ public async ValueTask RecoveryCheck2( var task = store1.TakeHybridLogCheckpointAsync(checkpointType); - if (completionSyncMode == CompletionSyncMode.Async) - { - var (status, token) = await task; - _ = await store2.RecoverAsync(default, token); - } - else - { - var (status, token) = task.AsTask().GetAwaiter().GetResult(); - 
_ = store2.Recover(default, token); - } + var (_, token) = await task.ConfigureAwait(false); + _ = await store2.RecoverAsync(default, token).ConfigureAwait(false); ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress, $"iter {iter}"); ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress, $"iter {iter}"); @@ -328,7 +312,7 @@ public async ValueTask RecoveryCheck2( [Test] [Category("TsavoriteKV"), Category("CheckpointRestore")] - public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType) + public async Task RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType) { Guid token = default; const long pageSize = MinKvLogPageSize; @@ -351,7 +335,7 @@ public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointTy ); if (iter > 0) - _ = store.Recover(default, token); + _ = await store.RecoverAsync(default, token).ConfigureAwait(false); using var s1 = store.NewSession(new SimpleLongSimpleFunctions()); var bc1 = s1.BasicContext; @@ -367,7 +351,7 @@ public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointTy var task = store.TakeHybridLogCheckpointAsync(checkpointType); bool success; - (success, token) = task.AsTask().GetAwaiter().GetResult(); + (success, token) = await task.ConfigureAwait(false); ClassicAssert.IsTrue(success); using var s2 = store.NewSession(new SimpleLongSimpleFunctions()); @@ -395,7 +379,7 @@ public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointTy [Test] [Category("TsavoriteKV"), Category("CheckpointRestore")] - public void RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType) + public async Task RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType) { const long pageSize = MinKvLogPageSize; using var store = new TsavoriteKV(new() 
@@ -418,7 +402,7 @@ public void RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.Fol _ = bc1.Upsert(TestSpanByteKey.FromPinnedSpan(SpanByte.FromPinnedVariable(ref key)), SpanByte.FromPinnedVariable(ref key)); var task = store.TakeHybridLogCheckpointAsync(checkpointType); - (bool success, Guid token) = task.AsTask().GetAwaiter().GetResult(); + (bool success, Guid token) = await task.ConfigureAwait(false); ClassicAssert.IsTrue(success); for (long key = 0; key < 1000; key++) @@ -455,7 +439,7 @@ public void RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.Fol } // Rollback to previous checkpoint - _ = store.Recover(default, token); + _ = await store.RecoverAsync(default, token).ConfigureAwait(false); for (long key = 0; key < 1000; key++) { @@ -515,7 +499,7 @@ public class RecoveryCheck3Tests : RecoveryCheckBase [Category("TsavoriteKV"), Category("CheckpointRestore")] public async ValueTask RecoveryCheck3( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) + [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) { const long pageSize = MinKvLogPageSize; using var store1 = new TsavoriteKV(new() @@ -586,16 +570,8 @@ public async ValueTask RecoveryCheck3( var task = store1.TakeFullCheckpointAsync(checkpointType); - if (completionSyncMode == CompletionSyncMode.Async) - { - var (status, token) = await task; - _ = await store2.RecoverAsync(default, token); - } - else - { - var (status, token) = task.AsTask().GetAwaiter().GetResult(); - _ = store2.Recover(default, token); - } + var (_, token) = await task.ConfigureAwait(false); + _ = await store2.RecoverAsync(default, token).ConfigureAwait(false); ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress, $"iter {iter}"); ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, 
store2.Log.ReadOnlyAddress, $"iter {iter}"); @@ -637,7 +613,7 @@ public class RecoveryCheck4Tests : RecoveryCheckBase [Category("TsavoriteKV"), Category("CheckpointRestore")] public async ValueTask RecoveryCheck4( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) + [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize) { const long pageSize = MinKvLogPageSize; using var store1 = new TsavoriteKV(new() @@ -710,16 +686,8 @@ public async ValueTask RecoveryCheck4( _ = store1.TakeIndexCheckpointAsync().AsTask().GetAwaiter().GetResult(); var task = store1.TakeHybridLogCheckpointAsync(checkpointType); - if (completionSyncMode == CompletionSyncMode.Async) - { - var (status, token) = await task; - _ = await store2.RecoverAsync(default, token); - } - else - { - var (status, token) = task.AsTask().GetAwaiter().GetResult(); - _ = store2.Recover(default, token); - } + var (_, token) = await task.ConfigureAwait(false); + _ = await store2.RecoverAsync(default, token).ConfigureAwait(false); ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress, $"iter {iter}"); ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress, $"iter {iter}"); @@ -761,7 +729,7 @@ public class RecoveryCheck5Tests : RecoveryCheckBase [Category("CheckpointRestore")] public async ValueTask RecoveryCheck5( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] bool isAsync, [Values] bool useReadCache, [Values(1L << 13, 1L << 16)] long indexSize) + [Values] bool useReadCache, [Values(1L << 13, 1L << 16)] long indexSize) { const long pageSize = MinKvLogPageSize; using var store1 = new TsavoriteKV(new() @@ -816,7 +784,7 @@ public async ValueTask RecoveryCheck5( } } - var result = await store1.GrowIndexAsync(); + var result = await 
store1.GrowIndexAsync().ConfigureAwait(false); ClassicAssert.IsTrue(result); for (long key = 0; key < 1000; key++) @@ -852,16 +820,8 @@ public async ValueTask RecoveryCheck5( , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions) ); - if (isAsync) - { - var (status, token) = await task; - _ = await store2.RecoverAsync(default, token); - } - else - { - var (status, token) = task.AsTask().GetAwaiter().GetResult(); - _ = store2.Recover(default, token); - } + var (_, token) = await task.ConfigureAwait(false); + _ = await store2.RecoverAsync(default, token).ConfigureAwait(false); ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress); ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress); @@ -951,7 +911,7 @@ public void OnStop(bool completed, long numberOfRecords) [Category("CheckpointRestore")] [Category("Smoke")] - public async ValueTask StreamingSnapshotBasicTest([Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, + public async ValueTask StreamingSnapshotBasicTest([Values] ReadCacheMode readCacheMode, [Values] bool reInsert, [Values(1L << 13, 1L << 16)] long indexSize) { using var store1 = new TsavoriteKV(new() @@ -1049,10 +1009,7 @@ public async ValueTask StreamingSnapshotBasicTest([Values] CompletionSyncMode co // Take a streaming snapshot checkpoint of the old store var iterator = new SnapshotIterator(store2, 1000); var task = store1.TakeFullCheckpointAsync(CheckpointType.StreamingSnapshot, streamingSnapshotIteratorFunctions: iterator); - if (completionSyncMode == CompletionSyncMode.Async) - _ = await task; - else - _ = task.AsTask().GetAwaiter().GetResult(); + _ = await task.ConfigureAwait(false); // Verify that the new store has all the records using var s2 = store2.NewSession(new MyFunctions()); diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs index c60658a922b..378680bf65c 
100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs @@ -81,7 +81,7 @@ private void PrepareToRecover(TestDeviceType deviceType) [Test] [Category("TsavoriteKV")] [Category("CheckpointRestore")] - public async ValueTask RecoveryTestSeparateCheckpoint([Values] CompletionSyncMode syncMode, [Values] TestDeviceType deviceType) + public async ValueTask RecoveryTestSeparateCheckpoint([Values] TestDeviceType deviceType) { Setup(deviceType); Populate(SeparateCheckpointAction); @@ -90,7 +90,7 @@ public async ValueTask RecoveryTestSeparateCheckpoint([Values] CompletionSyncMod { if (i >= indexTokens.Count) break; PrepareToRecover(deviceType); - await RecoverAndTest(i, syncMode == CompletionSyncMode.Async).ConfigureAwait(false); + await RecoverAndTest(i).ConfigureAwait(false); } } @@ -98,7 +98,7 @@ public async ValueTask RecoveryTestSeparateCheckpoint([Values] CompletionSyncMod [Category("TsavoriteKV")] [Category("CheckpointRestore")] [Category("Smoke")] - public async ValueTask RecoveryTestFullCheckpoint([Values] CompletionSyncMode syncMode, [Values] TestDeviceType deviceType) + public async ValueTask RecoveryTestFullCheckpoint([Values] TestDeviceType deviceType) { Setup(deviceType); Populate(FullCheckpointAction); @@ -106,7 +106,7 @@ public async ValueTask RecoveryTestFullCheckpoint([Values] CompletionSyncMode sy for (var i = 0; i < logTokens.Count; i++) { PrepareToRecover(deviceType); - await RecoverAndTest(i, syncMode == CompletionSyncMode.Async).ConfigureAwait(false); + await RecoverAndTest(i).ConfigureAwait(false); } } @@ -171,16 +171,13 @@ private void Populate(Action checkpointAction) _ = bContext.CompletePending(true); } - private async ValueTask RecoverAndTest(int tokenIndex, bool isAsync) + private async ValueTask RecoverAndTest(int tokenIndex) { var logToken = logTokens[tokenIndex]; var indexToken = indexTokens[tokenIndex]; // Recover - if (isAsync) - _ = await 
store.RecoverAsync(indexToken, logToken).ConfigureAwait(false); - else - _ = store.Recover(indexToken, logToken); + _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false); // Create array for reading var inputArray = GC.AllocateArray((int)NumUniqueKeys, pinned: true); @@ -293,32 +290,32 @@ private TsavoriteKV PrepareToRecover RunTest(allocatorType, () => StoreFunctions.Create(new AdId.Comparer(), SpanByteRecordTriggers.Instance), (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions), - Populate, Read, Recover, isAsync), + Populate, Read, Recover), AllocatorType.Object => RunTest(allocatorType, () => StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestObjectValue.Serializer(), DefaultRecordTriggers.Instance), (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions), - Populate, Read, Recover, isAsync), + Populate, Read, Recover), _ => throw new ApplicationException("Unknown allocator type"), }; ; @@ -329,8 +326,7 @@ private async ValueTask RunTest(AllocatorType alloc Func storeFunctionsCreator, Func allocatorCreator, Action> populateAction, Action> readAction, - Func, bool, ValueTask> recoverFunc, - bool isAsync) + Func, ValueTask> recoverFunc) where TStoreFunctions : IStoreFunctions where TAllocator : IAllocator { @@ -339,18 +335,18 @@ private async ValueTask RunTest(AllocatorType alloc readAction(store); if (smallSector) { - _ = Assert.ThrowsAsync(async () => await Checkpoint(store, isAsync).ConfigureAwait(false)); + _ = Assert.ThrowsAsync(async () => await Checkpoint(store).ConfigureAwait(false)); Assert.Pass("Verified expected exception on mismatched sector sizes; the test cannot continue, so exiting early with success"); } else - await Checkpoint(store, isAsync).ConfigureAwait(false); + await Checkpoint(store).ConfigureAwait(false); ClassicAssert.AreNotEqual(Guid.Empty, logToken); ClassicAssert.AreNotEqual(Guid.Empty, indexToken); readAction(store); store = 
PrepareToRecover(allocatorType, storeFunctionsCreator, allocatorCreator); - await recoverFunc(store, isAsync).ConfigureAwait(false); + await recoverFunc(store).ConfigureAwait(false); readAction(store); } @@ -399,27 +395,19 @@ private unsafe void Populate(TsavoriteKV st _ = bContext.CompletePending(true); } - private async ValueTask Checkpoint(TsavoriteKV store, bool isAsync) + private async ValueTask Checkpoint(TsavoriteKV store) where TStoreFunctions : IStoreFunctions where TAllocator : IAllocator { - if (isAsync) - { - var (success, token) = await store.TakeFullCheckpointAsync(CheckpointType.Snapshot).ConfigureAwait(false); - ClassicAssert.IsTrue(success); - logToken = token; - } - else - { - while (!store.TryInitiateFullCheckpoint(out logToken, CheckpointType.Snapshot)) { } - store.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult(); - } + var (success, token) = await store.TakeFullCheckpointAsync(CheckpointType.Snapshot).ConfigureAwait(false); + ClassicAssert.IsTrue(success); + logToken = token; indexToken = logToken; } - private async ValueTask RecoverAndReadTest(TsavoriteKV store, bool isAsync) + private async ValueTask RecoverAndReadTest(TsavoriteKV store) { - await Recover(store, isAsync).ConfigureAwait(false); + await Recover(store).ConfigureAwait(false); Read(store); } @@ -446,9 +434,9 @@ private static void Read(TsavoriteKV st } } - private async ValueTask RecoverAndReadTest(TsavoriteKV store, bool isAsync) + private async ValueTask RecoverAndReadTest(TsavoriteKV store) { - await Recover(store, isAsync).ConfigureAwait(false); + await Recover(store).ConfigureAwait(false); Read(store); } @@ -467,14 +455,11 @@ private static void Read(TsavoriteKV store) } } - private async ValueTask Recover(TsavoriteKV store, bool isAsync = false) + private async ValueTask Recover(TsavoriteKV store) where TStoreFunctions : IStoreFunctions where TAllocator : IAllocator { - if (isAsync) - _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false); - else 
- _ = store.Recover(indexToken, logToken); + _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false); } } } \ No newline at end of file diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs b/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs index 487e0e0b5a3..380b6537446 100644 --- a/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs +++ b/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs @@ -81,14 +81,14 @@ public void TearDown() [Category("TsavoriteKV"), Category("CheckpointRestore")] public async ValueTask PageBlobSimpleRecoveryTest( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode, [Values] bool testCommitCookie) + [Values] bool testCommitCookie) { IgnoreIfNotRunningAzureTests(); checkpointManager = new CheckpointManagerWithCookie( testCommitCookie, TestUtils.AzureStorageNamedDeviceFactoryCreator, new AzureCheckpointNamingScheme($"{AzureTestContainer}/{AzureTestDirectory}")); - await SimpleRecoveryTest1_Worker(checkpointType, completionSyncMode, testCommitCookie).ConfigureAwait(false); + await SimpleRecoveryTest1_Worker(checkpointType, testCommitCookie).ConfigureAwait(false); checkpointManager.PurgeAll(); } @@ -99,18 +99,17 @@ public async ValueTask PageBlobSimpleRecoveryTest( public async ValueTask LocalDeviceSimpleRecoveryTest( [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode, [Values] bool testCommitCookie) { checkpointManager = new CheckpointManagerWithCookie( testCommitCookie, new LocalStorageNamedDeviceFactoryCreator(), new DefaultCheckpointNamingScheme(Path.Join(MethodTestDir, "chkpt"))); - await SimpleRecoveryTest1_Worker(checkpointType, completionSyncMode, testCommitCookie).ConfigureAwait(false); + await SimpleRecoveryTest1_Worker(checkpointType, 
testCommitCookie).ConfigureAwait(false); checkpointManager.PurgeAll(); } - private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType, CompletionSyncMode completionSyncMode, bool testCommitCookie) + private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType, bool testCommitCookie) { log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleRecoveryTest1.log"), deleteOnClose: true); @@ -152,16 +151,10 @@ private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType } _ = store1.TryInitiateFullCheckpoint(out Guid token, checkpointType); - if (completionSyncMode == CompletionSyncMode.Sync) - store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult(); - else - await store1.CompleteCheckpointAsync().ConfigureAwait(false); + await store1.CompleteCheckpointAsync().ConfigureAwait(false); session1.Dispose(); - if (completionSyncMode == CompletionSyncMode.Sync) - _ = store2.Recover(token); - else - _ = await store2.RecoverAsync(token).ConfigureAwait(false); + _ = await store2.RecoverAsync(token).ConfigureAwait(false); if (testCommitCookie) ClassicAssert.IsTrue(store2.RecoveredCommitCookie.SequenceEqual(checkpointManager.Cookie)); @@ -194,8 +187,7 @@ private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType [Test] [Category("TsavoriteKV"), Category("CheckpointRestore")] public async ValueTask SimpleRecoveryTest2( - [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType, - [Values] CompletionSyncMode completionSyncMode) + [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType) { checkpointManager = new CheckpointManagerWithCookie(false, new LocalStorageNamedDeviceFactoryCreator(), new DefaultCheckpointNamingScheme(Path.Join(MethodTestDir, "checkpoints4")), false); log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleRecoveryTest2.log"), deleteOnClose: true); @@ -235,13 +227,10 @@ public async ValueTask 
SimpleRecoveryTest2( _ = bContext1.Upsert(inputArray[key], SpanByte.FromPinnedVariable(ref value), Empty.Default); } _ = store1.TryInitiateFullCheckpoint(out Guid token, checkpointType); - store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult(); + await store1.CompleteCheckpointAsync().ConfigureAwait(false); session1.Dispose(); - if (completionSyncMode == CompletionSyncMode.Sync) - _ = store2.Recover(token); - else - _ = await store2.RecoverAsync(token).ConfigureAwait(false); + _ = await store2.RecoverAsync(token).ConfigureAwait(false); var session2 = store2.NewSession(new AdSimpleFunctions()); var bContext2 = session1.BasicContext; @@ -260,7 +249,7 @@ public async ValueTask SimpleRecoveryTest2( [Test] [Category("TsavoriteKV"), Category("CheckpointRestore")] - public async ValueTask ShouldRecoverBeginAddress([Values] CompletionSyncMode completionSyncMode) + public async ValueTask ShouldRecoverBeginAddress() { log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleRecoveryTest2.log"), deleteOnClose: true); checkpointDir = Path.Join(MethodTestDir, "checkpoints6"); @@ -305,23 +294,17 @@ public async ValueTask ShouldRecoverBeginAddress([Values] CompletionSyncMode com store1.Log.ShiftBeginAddress(address); _ = store1.TryInitiateFullCheckpoint(out Guid token, CheckpointType.FoldOver); - if (completionSyncMode == CompletionSyncMode.Sync) - store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult(); - else - await store1.CompleteCheckpointAsync().ConfigureAwait(false); + await store1.CompleteCheckpointAsync().ConfigureAwait(false); session1.Dispose(); - if (completionSyncMode == CompletionSyncMode.Sync) - _ = store2.Recover(token); - else - _ = await store2.RecoverAsync(token).ConfigureAwait(false); + _ = await store2.RecoverAsync(token).ConfigureAwait(false); ClassicAssert.AreEqual(address, store2.Log.BeginAddress); } [Test] [Category("TsavoriteKV"), Category("CheckpointRestore")] - public async ValueTask SimpleReadAndUpdateInfoTest([Values] 
CompletionSyncMode completionSyncMode) + public async ValueTask SimpleReadAndUpdateInfoTest() { checkpointManager = new CheckpointManagerWithCookie(false, new LocalStorageNamedDeviceFactoryCreator(), new DefaultCheckpointNamingScheme(Path.Join(MethodTestDir, "checkpoints")), false); log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleReadAndUpdateInfoTest.log"), deleteOnClose: true); @@ -369,16 +352,10 @@ public async ValueTask SimpleReadAndUpdateInfoTest([Values] CompletionSyncMode c } } _ = store1.TryInitiateFullCheckpoint(out Guid token, CheckpointType.FoldOver); - if (completionSyncMode == CompletionSyncMode.Sync) - store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult(); - else - await store1.CompleteCheckpointAsync().ConfigureAwait(false); + await store1.CompleteCheckpointAsync().ConfigureAwait(false); session1.Dispose(); - if (completionSyncMode == CompletionSyncMode.Sync) - _ = store2.Recover(token); - else - _ = await store2.RecoverAsync(token).ConfigureAwait(false); + _ = await store2.RecoverAsync(token).ConfigureAwait(false); var session2 = store2.NewSession(functions2); var bContext2 = session2.BasicContext; diff --git a/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs b/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs index 294fe36bb92..9923a46aa38 100644 --- a/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs @@ -358,7 +358,7 @@ public void ManualLockCollidingHashCodes([Values] UseSingleBucketComparer /* jus [Test] [Category("TsavoriteKV")] [Category("Smoke")] - public async Task TestShiftHeadAddressLUC([Values] CompletionSyncMode syncMode) + public async Task TestShiftHeadAddressLUC() { long input = 0; const int RandSeed = 10; @@ -412,16 +412,9 @@ public async Task TestShiftHeadAddressLUC([Values] CompletionSyncMode syncMode) 
AssertTotalLockCounts(0, 0); - if (syncMode == CompletionSyncMode.Sync) - { - _ = luContext.CompletePending(true); - } - else - { - luContext.EndUnsafe(); - await luContext.CompletePendingAsync().ConfigureAwait(false); - luContext.BeginUnsafe(); - } + luContext.EndUnsafe(); + await luContext.CompletePendingAsync().ConfigureAwait(false); + luContext.BeginUnsafe(); // Shift head and retry - should not find in main memory now store.Log.FlushAndEvict(true); @@ -458,17 +451,9 @@ public async Task TestShiftHeadAddressLUC([Values] CompletionSyncMode syncMode) // We did not lock all keys, only the "Action" ones - one lock per bucket, all shared in this test AssertTotalLockCounts(0, expectedS); - CompletedOutputIterator outputs; - if (syncMode == CompletionSyncMode.Sync) - { - _ = luContext.CompletePendingWithOutputs(out outputs, wait: true); - } - else - { - luContext.EndUnsafe(); - outputs = await luContext.CompletePendingWithOutputsAsync().ConfigureAwait(false); - luContext.BeginUnsafe(); - } + luContext.EndUnsafe(); + var outputs = await luContext.CompletePendingWithOutputsAsync().ConfigureAwait(false); + luContext.BeginUnsafe(); foreach (var idx in EnumActionKeyIndices(lockKeys, LockOperationType.Unlock)) { diff --git a/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs b/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs index 14b8e9f7812..061161ed337 100644 --- a/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs +++ b/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs @@ -267,7 +267,7 @@ public unsafe void NativeInMemWriteRead2() [Test] [Category("TsavoriteKV")] [Category("Smoke")] - public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType, [Values] CompletionSyncMode syncMode) + public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType) { InputStruct input = default; const int RandSeed = 10; @@ -307,16 +307,9 @@ public async Task 
TestShiftHeadAddressUC([Values] TestDeviceType deviceType, [Va ClassicAssert.AreEqual(value.vfield2, output.value.vfield2); } } - if (syncMode == CompletionSyncMode.Sync) - { - _ = uContext.CompletePending(true); - } - else - { - uContext.EndUnsafe(); - await uContext.CompletePendingAsync().ConfigureAwait(false); - uContext.BeginUnsafe(); - } + uContext.EndUnsafe(); + await uContext.CompletePendingAsync().ConfigureAwait(false); + uContext.BeginUnsafe(); // Shift head and retry - should not find in main memory now store.Log.FlushAndEvict(true); @@ -333,17 +326,9 @@ public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType, [Va ClassicAssert.IsTrue(foundStatus.IsPending); } - CompletedOutputIterator outputs; - if (syncMode == CompletionSyncMode.Sync) - { - _ = uContext.CompletePendingWithOutputs(out outputs, wait: true); - } - else - { - uContext.EndUnsafe(); - outputs = await uContext.CompletePendingWithOutputsAsync().ConfigureAwait(false); - uContext.BeginUnsafe(); - } + uContext.EndUnsafe(); + var outputs = await uContext.CompletePendingWithOutputsAsync().ConfigureAwait(false); + uContext.BeginUnsafe(); int count = 0; while (outputs.Next()) diff --git a/test/standalone/Garnet.test.collections/GarnetObjectTests.cs b/test/standalone/Garnet.test.collections/GarnetObjectTests.cs index 1acdc34b459..224d6cd4a98 100644 --- a/test/standalone/Garnet.test.collections/GarnetObjectTests.cs +++ b/test/standalone/Garnet.test.collections/GarnetObjectTests.cs @@ -63,7 +63,7 @@ public async Task WriteCheckpointRead() _ = await store.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver).ConfigureAwait(false); store.Dispose(); CreateStore(); - _ = store.Recover(); + _ = await store.RecoverAsync().ConfigureAwait(false); LocalRead(); void LocalWrite() @@ -100,7 +100,7 @@ public async Task WriteCheckpointCopyUpdate() _ = await store.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver).ConfigureAwait(false); store.Dispose(); CreateStore(); - _ = store.Recover(); + _ = 
await store.RecoverAsync().ConfigureAwait(false); LocalRead(); void LocalWrite() diff --git a/test/standalone/Garnet.test/RespConfigTests.cs b/test/standalone/Garnet.test/RespConfigTests.cs index 2a03faec1c2..06bb5221cc6 100644 --- a/test/standalone/Garnet.test/RespConfigTests.cs +++ b/test/standalone/Garnet.test/RespConfigTests.cs @@ -713,8 +713,8 @@ public void ConfigSetHeapMemorySizeUtilizationTest(int smallerSize) // Sanity-check the preconditions for the shrink/eviction we are about to trigger. var apcBefore = store.Log.AllocatedPageCount; var heapBefore = tracker.LogHeapSizeBytes; - Assert.That(apcBefore, Is.GreaterThan(LogSizeTracker.MinResizeTargetPageCount), - "Test precondition: need more than MinResizeTargetPageCount pages for eviction to be possible."); + Assert.That(apcBefore, Is.GreaterThan(1), + "Test precondition: need more than one page for eviction to be possible."); Assert.That(heapBefore, Is.GreaterThan(0), "Test precondition: heap should be non-empty after inserts."); using var trimCompleteEvent = new ManualResetEventSlim(false);