diff --git a/libs/cluster/Server/ClusterProvider.cs b/libs/cluster/Server/ClusterProvider.cs
index f8a1469c938..47e54a4335b 100644
--- a/libs/cluster/Server/ClusterProvider.cs
+++ b/libs/cluster/Server/ClusterProvider.cs
@@ -74,10 +74,8 @@ public bool AllowDataLoss
=> serverOptions.AllowDataLoss;
///
- public void Recover()
- {
- replicationManager.Recover();
- }
+ public ValueTask RecoverAsync()
+ => replicationManager.RecoverAsync();
///
public bool PreventRoleChange()
diff --git a/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs b/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs
index db03bcd7348..fb61bf108b7 100644
--- a/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs
+++ b/libs/cluster/Server/Replication/ReplicaOps/ReplicaDiskbasedSync.cs
@@ -168,7 +168,7 @@ async Task ReplicaSyncAttachTaskAsync(bool downgradeLock, bool forceAsyn
cEntry = GetLatestCheckpointEntryFromDisk();
logger?.LogCheckpointEntry(LogLevel.Information, nameof(ReplicaSyncAttachTaskAsync), cEntry);
- storeWrapper.RecoverAOF();
+ await storeWrapper.RecoverAOFAsync().ConfigureAwait(false);
logger?.LogInformation("InitiateReplicaSync: AOF BeginAddress:{beginAddress} AOF TailAddress:{tailAddress}", storeWrapper.appendOnlyFile.Log.BeginAddress, storeWrapper.appendOnlyFile.Log.TailAddress);
var beginAddress = storeWrapper.appendOnlyFile.Log.BeginAddress;
@@ -301,10 +301,12 @@ public AofAddress TryReplicaDiskbasedRecovery(
remoteCheckpoint.metadata.storeIndexToken,
remoteCheckpoint.metadata.storeHlogToken);
- storeWrapper.RecoverCheckpoint(
+#pragma warning disable VSTHRD002 // The replica-recovery RESP path is synchronous and must complete before sending a response.
+ storeWrapper.RecoverCheckpointAsync(
replicaRecover: true,
recoverStoreFromToken,
- remoteCheckpoint.metadata);
+ remoteCheckpoint.metadata).AsTask().GetAwaiter().GetResult();
+#pragma warning restore VSTHRD002
if (replayAOFMap > 0)
{
diff --git a/libs/cluster/Server/Replication/ReplicationManager.cs b/libs/cluster/Server/Replication/ReplicationManager.cs
index d42384fcbad..f5dbca52588 100644
--- a/libs/cluster/Server/Replication/ReplicationManager.cs
+++ b/libs/cluster/Server/Replication/ReplicationManager.cs
@@ -509,20 +509,20 @@ public void Dispose()
///
/// Main recover method for replication
///
- public void Recover()
+ public async ValueTask RecoverAsync()
{
var nodeRole = clusterProvider.clusterManager.CurrentConfig.LocalNodeRole;
switch (nodeRole)
{
case NodeRole.PRIMARY:
- RecoverCheckpointAndAOF();
+ await RecoverCheckpointAndAOFAsync().ConfigureAwait(false);
break;
case NodeRole.REPLICA:
// If configured, load from disk - otherwise wait to connect with a Primary
if (clusterProvider.serverOptions.ClusterReplicaResumeWithData)
{
- RecoverCheckpointAndAOF();
+ await RecoverCheckpointAndAOFAsync().ConfigureAwait(false);
}
break;
@@ -535,10 +535,10 @@ public void Recover()
///
/// Recover whatever is available from .
///
- private void RecoverCheckpointAndAOF()
+ private async ValueTask RecoverCheckpointAndAOFAsync()
{
- storeWrapper.RecoverCheckpoint();
- storeWrapper.RecoverAOF();
+ await storeWrapper.RecoverCheckpointAsync().ConfigureAwait(false);
+ await storeWrapper.RecoverAOFAsync().ConfigureAwait(false);
if (clusterProvider.serverOptions.EnableAOF)
{
// If recovered checkpoint corresponds to an unavailable AOF address, we initialize AOF to that address
@@ -555,7 +555,7 @@ private void RecoverCheckpointAndAOF()
// First recover and then load latest checkpoint info in-memory
if (!InitializeCheckpointStore())
- logger?.LogWarning("Failed acquiring latest memory checkpoint metadata at {method}", nameof(RecoverCheckpointAndAOF));
+ logger?.LogWarning("Failed acquiring latest memory checkpoint metadata at {method}", nameof(RecoverCheckpointAndAOFAsync));
}
///
diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs
index 7c96e707e89..eae208466d9 100644
--- a/libs/host/GarnetServer.cs
+++ b/libs/host/GarnetServer.cs
@@ -485,7 +485,9 @@ private GarnetAppendOnlyFile CreateAOF(int dbId)
///
public void Start()
{
- Provider.Recover();
+#pragma warning disable VSTHRD002 // Server startup is synchronous and must complete recovery before accepting connections.
+ Provider.RecoverAsync().AsTask().GetAwaiter().GetResult();
+#pragma warning restore VSTHRD002
for (var i = 0; i < servers.Length; i++)
servers[i].Start();
Provider.Start();
diff --git a/libs/server/AOF/GarnetLog.cs b/libs/server/AOF/GarnetLog.cs
index cd700ee16c5..76517f2ca1e 100644
--- a/libs/server/AOF/GarnetLog.cs
+++ b/libs/server/AOF/GarnetLog.cs
@@ -171,13 +171,8 @@ public AofAddress MemorySizeBytes
}
}
- public void Recover()
- {
- if (singleLog != null)
- singleLog.Recover();
- else
- shardedLog.Recover();
- }
+ public ValueTask RecoverAsync()
+ => singleLog != null ? singleLog.RecoverAsync() : shardedLog.RecoverAsync();
public bool RecoverLatestSequenceNumber(out long recoverUntilSequenceNumber)
{
diff --git a/libs/server/AOF/ShardedLog.cs b/libs/server/AOF/ShardedLog.cs
index 61822105b9a..bf496070563 100644
--- a/libs/server/AOF/ShardedLog.cs
+++ b/libs/server/AOF/ShardedLog.cs
@@ -4,6 +4,7 @@
using System.Diagnostics;
using System.Linq;
using System.Threading;
+using System.Threading.Tasks;
using Garnet.common;
using Microsoft.Extensions.Logging;
using Tsavorite.core;
@@ -166,10 +167,10 @@ public AofAddress MemorySizeBytes
}
}
- public void Recover()
+ public async ValueTask RecoverAsync()
{
foreach (var log in sublog)
- log.Recover();
+ await log.RecoverAsync().ConfigureAwait(false);
}
public void Reset()
diff --git a/libs/server/AOF/SingleLog.cs b/libs/server/AOF/SingleLog.cs
index bf3306522f4..5221916bdaf 100644
--- a/libs/server/AOF/SingleLog.cs
+++ b/libs/server/AOF/SingleLog.cs
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
+using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Tsavorite.core;
@@ -39,7 +40,7 @@ public class SingleLog(TsavoriteLogSettings logSettings, ILogger logger = null)
public AofAddress MemorySizeBytes => AofAddress.Create(1, value: log.MemorySizeBytes);
- public void Recover() => log.Recover();
+ public ValueTask RecoverAsync() => log.RecoverAsync();
public void Reset() => log.Reset();
public void Dispose()
diff --git a/libs/server/Cluster/IClusterProvider.cs b/libs/server/Cluster/IClusterProvider.cs
index 975adb210cb..79523e2eafa 100644
--- a/libs/server/Cluster/IClusterProvider.cs
+++ b/libs/server/Cluster/IClusterProvider.cs
@@ -101,7 +101,7 @@ public interface IClusterProvider : IDisposable
///
/// Recover the cluster
///
- void Recover();
+ ValueTask RecoverAsync();
///
/// Reset gossip stats
diff --git a/libs/server/Databases/DatabaseManagerBase.cs b/libs/server/Databases/DatabaseManagerBase.cs
index 80384423290..24c2887cbba 100644
--- a/libs/server/Databases/DatabaseManagerBase.cs
+++ b/libs/server/Databases/DatabaseManagerBase.cs
@@ -35,7 +35,7 @@ internal abstract class DatabaseManagerBase : IDatabaseManager
public abstract void ResumeCheckpoints(int dbId);
///
- public abstract void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null);
+ public abstract ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null);
///
public abstract Task TakeCheckpointAsync(bool background, int dbId = -1, CancellationToken token = default, ILogger logger = null);
@@ -57,7 +57,7 @@ public abstract Task TaskCheckpointBasedOnAofSizeLimitAsync(long aofSizeLimit,
public abstract Task WaitForCommitToAofAsync(CancellationToken token = default, ILogger logger = null);
///
- public abstract void RecoverAOF();
+ public abstract ValueTask RecoverAOFAsync();
///
public abstract AofAddress ReplayAOF(AofAddress untilAddress);
@@ -164,18 +164,16 @@ protected DatabaseManagerBase(StoreWrapper.DatabaseCreatorDelegate createDatabas
/// Recover single database from checkpoint
///
/// Database to recover
- /// Store version
- protected void RecoverDatabaseCheckpoint(GarnetDatabase db, out long storeVersion)
+ /// <returns>The store version returned by the store's RecoverAsync; LastSaveTime is updated only when it is greater than zero</returns>
+ protected async ValueTask<long> RecoverDatabaseCheckpointAsync(GarnetDatabase db)
{
- storeVersion = 0;
-
- storeVersion = db.Store.Recover();
+ var storeVersion = await db.Store.RecoverAsync().ConfigureAwait(false);
Logger?.LogInformation("Recovered store to version {storeVersion}", storeVersion);
if (storeVersion > 0)
- {
db.LastSaveTime = DateTimeOffset.UtcNow;
- }
+
+ return storeVersion;
}
///
@@ -227,11 +225,11 @@ protected static void ResumeCheckpoints(GarnetDatabase db)
/// Recover a single database from AOF
///
/// Database to recover
- protected void RecoverDatabaseAOF(GarnetDatabase db)
+ protected async ValueTask RecoverDatabaseAOFAsync(GarnetDatabase db)
{
if (db.AppendOnlyFile == null) return;
- db.AppendOnlyFile.Log.Recover();
+ await db.AppendOnlyFile.Log.RecoverAsync().ConfigureAwait(false);
Logger?.LogInformation("Recovered AOF: begin address = {beginAddress}, tail address = {tailAddress}, DB ID: {id}",
db.AppendOnlyFile.Log.BeginAddress, db.AppendOnlyFile.Log.TailAddress, db.Id);
}
diff --git a/libs/server/Databases/IDatabaseManager.cs b/libs/server/Databases/IDatabaseManager.cs
index e59176dba23..bf53e679797 100644
--- a/libs/server/Databases/IDatabaseManager.cs
+++ b/libs/server/Databases/IDatabaseManager.cs
@@ -82,7 +82,7 @@ public interface IDatabaseManager : IDisposable
///
///
///
- public void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null);
+ public ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null);
///
/// Take checkpoint of all active databases (or a specified database) if checkpointing is not in progress
@@ -140,7 +140,7 @@ public Task TaskCheckpointBasedOnAofSizeLimitAsync(long aofSizeLimit, Cancellati
///
/// Recover AOF
///
- public void RecoverAOF();
+ public ValueTask RecoverAOFAsync();
///
/// When replaying AOF we do not want to write AOF records again.
diff --git a/libs/server/Databases/MultiDatabaseManager.cs b/libs/server/Databases/MultiDatabaseManager.cs
index 1dfbd7e4324..c8b6a3c869f 100644
--- a/libs/server/Databases/MultiDatabaseManager.cs
+++ b/libs/server/Databases/MultiDatabaseManager.cs
@@ -81,11 +81,11 @@ public MultiDatabaseManager(SingleDatabaseManager src) :
}
///
- public override void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null)
+ public override async ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null)
{
if (replicaRecover)
throw new GarnetException(
- $"Unexpected call to {nameof(MultiDatabaseManager)}.{nameof(RecoverCheckpoint)} with {nameof(replicaRecover)} == true.");
+ $"Unexpected call to {nameof(MultiDatabaseManager)}.{nameof(RecoverCheckpointAsync)} with {nameof(replicaRecover)} == true.");
var checkpointParentDir = StoreWrapper.serverOptions.StoreCheckpointBaseDirectory;
var checkpointDirBaseName = GarnetServerOptions.GetCheckpointDirectoryName(0);
@@ -116,7 +116,7 @@ public override void RecoverCheckpoint(bool replicaRecover = false, bool recover
try
{
- RecoverDatabaseCheckpoint(db, out storeVersion);
+ storeVersion = await RecoverDatabaseCheckpointAsync(db).ConfigureAwait(false);
}
catch (TsavoriteNoHybridLogException ex)
{
@@ -416,7 +416,7 @@ public override async Task WaitForCommitToAofAsync(CancellationToken token = def
}
///
- public override void RecoverAOF()
+ public override async ValueTask RecoverAOFAsync()
{
var aofParentDir = StoreWrapper.serverOptions.AppendOnlyFileBaseDirectory;
var aofDirBaseName = GarnetServerOptions.GetAppendOnlyFileDirectoryName(0);
@@ -442,7 +442,7 @@ public override void RecoverAOF()
if (!success)
throw new GarnetException($"Failed to retrieve or create database for AOF recovery (DB ID = {dbId}).");
- RecoverDatabaseAOF(db);
+ await RecoverDatabaseAOFAsync(db).ConfigureAwait(false);
}
}
diff --git a/libs/server/Databases/SingleDatabaseManager.cs b/libs/server/Databases/SingleDatabaseManager.cs
index 28508436cfa..ce9e6ab8197 100644
--- a/libs/server/Databases/SingleDatabaseManager.cs
+++ b/libs/server/Databases/SingleDatabaseManager.cs
@@ -54,7 +54,7 @@ public override GarnetDatabase TryGetOrAddDatabase(int dbId, out bool success, o
}
///
- public override void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null)
+ public override async ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null)
{
long storeVersion = 0;
try
@@ -64,7 +64,9 @@ public override void RecoverCheckpoint(bool replicaRecover = false, bool recover
// Note: Since replicaRecover only pertains to cluster-mode, we can use the default store pointers (since multi-db mode is disabled in cluster-mode)
if (metadata!.storeIndexToken != default && metadata.storeHlogToken != default)
{
- storeVersion = !recoverFromToken ? Store.Recover() : Store.Recover(metadata.storeIndexToken, metadata.storeHlogToken);
+ storeVersion = !recoverFromToken
+ ? await Store.RecoverAsync().ConfigureAwait(false)
+ : await Store.RecoverAsync(metadata.storeIndexToken, metadata.storeHlogToken).ConfigureAwait(false);
}
if (storeVersion > 0)
@@ -72,7 +74,7 @@ public override void RecoverCheckpoint(bool replicaRecover = false, bool recover
}
else
{
- RecoverDatabaseCheckpoint(defaultDatabase, out storeVersion);
+ storeVersion = await RecoverDatabaseCheckpointAsync(defaultDatabase).ConfigureAwait(false);
}
}
catch (TsavoriteNoHybridLogException ex)
@@ -239,7 +241,7 @@ public override async Task WaitForCommitToAofAsync(CancellationToken token = def
}
///
- public override void RecoverAOF() => RecoverDatabaseAOF(defaultDatabase);
+ public override ValueTask RecoverAOFAsync() => RecoverDatabaseAOFAsync(defaultDatabase);
///
public override AofAddress ReplayAOF(AofAddress untilAddress)
diff --git a/libs/server/Providers/GarnetProvider.cs b/libs/server/Providers/GarnetProvider.cs
index c27819f53f6..3c766d9ebef 100644
--- a/libs/server/Providers/GarnetProvider.cs
+++ b/libs/server/Providers/GarnetProvider.cs
@@ -2,6 +2,7 @@
// Licensed under the MIT license.
using System.Threading;
+using System.Threading.Tasks;
using Garnet.common;
using Garnet.networking;
using Tsavorite.core;
@@ -43,8 +44,8 @@ public void Start()
///
/// Recover
///
- public void Recover()
- => storeWrapper.Recover();
+ public ValueTask RecoverAsync()
+ => storeWrapper.RecoverAsync();
///
/// Dispose
diff --git a/libs/server/StoreWrapper.cs b/libs/server/StoreWrapper.cs
index cbc44a024c0..dfc81e3b438 100644
--- a/libs/server/StoreWrapper.cs
+++ b/libs/server/StoreWrapper.cs
@@ -360,19 +360,19 @@ public IPEndPoint GetClusterEndpoint()
return localEndPoint;
}
- internal void Recover()
+ internal async ValueTask RecoverAsync()
{
if (serverOptions.EnableCluster)
{
if (serverOptions.Recover)
- clusterProvider.Recover();
+ await clusterProvider.RecoverAsync().ConfigureAwait(false);
}
else
{
if (serverOptions.Recover)
{
- RecoverCheckpoint();
- RecoverAOF();
+ await RecoverCheckpointAsync().ConfigureAwait(false);
+ await RecoverAOFAsync().ConfigureAwait(false);
ReplayAOF(AofAddress.Create(length: serverOptions.AofPhysicalSublogCount, value: -1));
}
}
@@ -413,10 +413,10 @@ public async Task TakeOnDemandCheckpointAsync(DateTimeOffset entryTime, int dbId
///
/// Recover checkpoint
///
- public void RecoverCheckpoint(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null)
+ public async ValueTask RecoverCheckpointAsync(bool replicaRecover = false, bool recoverFromToken = false, CheckpointMetadata metadata = null)
{
StartSizeTrackers(); // We need to start this before recovery to have size tracking during the recovery process.
- databaseManager.RecoverCheckpoint(replicaRecover, recoverFromToken, metadata);
+ await databaseManager.RecoverCheckpointAsync(replicaRecover, recoverFromToken, metadata).ConfigureAwait(false);
}
///
@@ -447,7 +447,7 @@ public void ResumeCheckpoints(int dbId = 0)
///
/// Recover AOF
///
- public void RecoverAOF() => databaseManager.RecoverAOF();
+ public ValueTask RecoverAOFAsync() => databaseManager.RecoverAOFAsync();
///
/// When replaying AOF we do not want to write AOF records again.
diff --git a/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs b/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs
index fe040a586e0..fe605e77dcd 100644
--- a/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs
+++ b/libs/storage/Tsavorite/cs/benchmark/YCSB.benchmark/TestLoader.cs
@@ -386,7 +386,7 @@ internal bool MaybeRecoverStore(TsavoriteKV store)
try
{
var sw = Stopwatch.StartNew();
- store.Recover();
+ store.RecoverAsync().AsTask().GetAwaiter().GetResult();
sw.Stop();
Console.WriteLine($" Completed recovery in {(double)sw.ElapsedMilliseconds / 1000:N3} seconds");
return true;
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs
index c3cf6d505a0..51ca3b76565 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs
@@ -33,6 +33,10 @@ public abstract class AllocatorBase
internal virtual ObjectLogFilePositionInfo GetObjectLogTail() => new(); // This marks it as "unset"
/// Set the ObjectLog tail position, if this is ObjectAllocator.
internal virtual void SetObjectLogTail(ObjectLogFilePositionInfo tail) { }
+ /// Calculate the total serialized object size on a loaded page. Only implemented by ObjectAllocator.
+ internal virtual long CalculatePageObjectSizes(long page, long startAddress, long untilAddress) => 0;
+ /// Load objects for records on an already-loaded page for recovery pass 2.
+ internal virtual void LoadObjectsForRecoveryPass2(long page, long fromAddress, long untilAddress, IDevice objectLogDevice) { }
}
///
@@ -546,8 +550,8 @@ private protected AllocatorBase(AllocatorSettings allocatorSettings, TStoreFunct
throw new TsavoriteException($"{nameof(logSettings.SegmentSizeBits)} must be between {LogSettings.kMinMainLogSegmentSizeBits} and {LogSettings.kMaxSegmentSizeBits}");
if (logSettings.MemorySize != 0 && (logSettings.MemorySize < 1L << LogSettings.kMinMemorySizeBits || logSettings.MemorySize > 1L << LogSettings.kMaxMemorySizeBits))
throw new TsavoriteException($"{nameof(logSettings.MemorySize)} must be between {1L << LogSettings.kMinMemorySizeBits} and {1L << LogSettings.kMaxMemorySizeBits}, or may be 0 for ReadOnly TsavoriteLog");
- if ((logSettings.MemorySize != 0) && (logSettings.MemorySize < (1L << logSettings.PageSizeBits) * 2))
- throw new TsavoriteException($"{nameof(logSettings.MemorySize)} must be at least twice the page size ({1L << logSettings.PageSizeBits})");
+ if ((logSettings.MemorySize != 0) && (logSettings.MemorySize < (1L << logSettings.PageSizeBits) * LogSettings.kMinPageCount))
+ throw new TsavoriteException($"{nameof(logSettings.MemorySize)} must be at least {LogSettings.kMinPageCount}x the page size ({1L << logSettings.PageSizeBits})");
if (logSettings.MutableFraction < 0.0 || logSettings.MutableFraction > 1.0)
throw new TsavoriteException($"{nameof(logSettings.MutableFraction)} must be >= 0.0 and <= 1.0");
if (logSettings.ReadCacheSettings is not null)
@@ -926,13 +930,7 @@ void AllocatePagesWithException(int pageIndex, PageOffset localTailPageOffset, i
{
try
{
- // Allocate this page, if needed
- if (!IsAllocated(pageIndex % BufferSize))
- _wrapper.AllocatePage(pageIndex % BufferSize);
-
- // Allocate next page in advance, if needed
- if (!IsAllocated((pageIndex + 1) % BufferSize))
- _wrapper.AllocatePage((pageIndex + 1) % BufferSize);
+ AllocateCurrentAndNextPage(pageIndex);
}
catch
{
@@ -943,6 +941,25 @@ void AllocatePagesWithException(int pageIndex, PageOffset localTailPageOffset, i
}
}
+ /// <summary>
+ /// Allocate the page containing <paramref name="page"/> and, as the allocator's allocate-ahead invariant, the page following it, each only if it is
+ /// not already allocated.
+ /// </summary>
+ /// <param name="page">The page number whose page (and the next page) should be allocated.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ void AllocateCurrentAndNextPage(long page)
+ {
+ // Allocate the current page, if needed.
+ var pageIndex = (int)(page % BufferSize);
+ if (!IsAllocated(pageIndex))
+ _wrapper.AllocatePage(pageIndex);
+
+ // Allocate the next page in advance (an invariant in the allocator), if needed.
+ var nextPageIndex = (pageIndex + 1) % BufferSize;
+ if (!IsAllocated(nextPageIndex))
+ _wrapper.AllocatePage(nextPageIndex);
+ }
+
///
/// Shift log read-only address, with an optional wait
///
@@ -1027,7 +1044,7 @@ bool NeedToShiftAddress(long pageIndex, PageOffset localTailPageOffset, int numS
// First check whether we need to shift HeadAddress. If we have a logSizeTracker that's over budget then we have already issued
// a shift if needed (and allowed by allocated page count); otherwise make sure we stay in the MaxAllocatedPageCount (which may be less than BufferSize).
var desiredHeadAddress = HeadAddress;
- if (logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit)
+ if (logSizeTracker is null || !logSizeTracker.IsOverBudget)
{
var headPage = GetPage(desiredHeadAddress);
if (pageIndex - headPage >= MaxAllocatedPageCount)
@@ -1062,7 +1079,7 @@ void IssueShiftAddress(long pageIndex, bool needSHA)
// First check whether we need to shift HeadAddress. If we are not forcing for flush and have a logSizeTracker that's over budget then we have already issued
// a shift if needed (and allowed by allocated page count); otherwise make sure we stay in the MaxAllocatedPageCount (which may be less than BufferSize).
var desiredHeadAddress = HeadAddress;
- if (needSHA || logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit)
+ if (needSHA || logSizeTracker is null || !logSizeTracker.IsOverBudget)
{
var headPage = GetPage(desiredHeadAddress);
if (pageIndex - headPage >= MaxAllocatedPageCount)
@@ -1396,6 +1413,13 @@ public void ShiftBeginAddress(long newBeginAddress, bool truncateLog, bool noFlu
}
}
+ /// Find the head address cutoff on a page for partial object loading. Only implemented by ObjectAllocator.
+ internal virtual long FindHeadAddressCutoffOnPage(long page, long untilAddress, long totalPageObjectSize, int numPagesBelowCurrentPage, long remainingBudget, out int numPagesBelowToEvict)
+ {
+ numPagesBelowToEvict = 0;
+ return GetFirstValidLogicalAddressOnPage(page);
+ }
+
/// Invokes eviction observer if set and then frees the page.
internal void EvictPageForRecovery(long page)
{
@@ -1405,10 +1429,11 @@ internal void EvictPageForRecovery(long page)
var source = IsReadCache ? EvictionSource.ReadCache : EvictionSource.MainLog;
// Per-record eviction walk handles internal heap accounting (key + value via
- // logSizeTracker) and optionally notifies the application via OnEvict.
+ // logSizeTracker) and optionally notifies the application via OnEvict. isRecovery: true so that pages whose
+ // object load was deferred (empty ObjectIdMap, un-deserialized object/overflow slots) are skipped.
if (logSizeTracker is not null || storeFunctions.CallOnEvict)
{
- _wrapper.EvictRecordsInRange(start, end, source);
+ _wrapper.EvictRecordsInRange(start, end, source, isRecovery: true);
}
if (onEvictionObserver is not null)
{
@@ -1496,7 +1521,7 @@ private void OnPagesClosedWorker()
// via OnEvict for app-level cleanup.
var evictSource = IsReadCache ? EvictionSource.ReadCache : EvictionSource.MainLog;
if (logSizeTracker is not null || storeFunctions.CallOnEvict)
- _wrapper.EvictRecordsInRange(start, end, evictSource);
+ _wrapper.EvictRecordsInRange(start, end, evictSource, isRecovery: false);
// If we are using a null storage device, we must also shift BeginAddress (leave it in-memory)
if (IsNullDevice)
@@ -1632,15 +1657,8 @@ protected internal virtual void RecoveryReset(long tailAddress, long headAddress
if (pageHeaderSize > 0 && TailPageOffset.Offset == 0)
TailPageOffset.Offset = pageHeaderSize;
- // Allocate current page if necessary
- var pageIndex = TailPageOffset.Page % BufferSize;
- if (!IsAllocated(pageIndex))
- _wrapper.AllocatePage(pageIndex);
-
- // Allocate next page as well - this is an invariant in the allocator!
- var nextPageIndex = (pageIndex + 1) % BufferSize;
- if (!IsAllocated(nextPageIndex))
- _wrapper.AllocatePage(nextPageIndex);
+ // Allocate the current page and the next page (the allocate-ahead invariant) if necessary.
+ AllocateCurrentAndNextPage(TailPageOffset.Page);
BeginAddress = beginAddress;
HeadAddress = headAddress;
@@ -1651,7 +1669,7 @@ protected internal virtual void RecoveryReset(long tailAddress, long headAddress
SafeReadOnlyAddress = readonlyAddress;
// for the last page which contains tailoffset, it must be open
- pageIndex = GetPageIndexForAddress(tailAddress);
+ var pageIndex = GetPageIndexForAddress(tailAddress);
// clear the last page starting from tail address
ClearPage(pageIndex, (int)GetOffsetOnPage(tailAddress));
@@ -1718,14 +1736,14 @@ private SectorAlignedMemory GetAndPopulateReadBuffer(long fromLogicalAddress, in
/// Read pages from specified device(s) for recovery, with no output of the countdown event (but it is still created in the
/// and thus must be Dispose()d).
- public void AsyncReadPagesForRecovery(long readPageStart, int numPages, long untilAddress, TContext context,
- long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null)
- => AsyncReadPagesForRecovery(readPageStart, numPages, untilAddress, context, out _, devicePageOffset, logDevice, objectLogDevice);
+ internal void AsyncReadPagesForRecovery(long readPageStart, int numPages, long untilAddress, TContext context,
+ long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null, RecoveryPhase recoveryPhase = RecoveryPhase.Pass1)
+ => AsyncReadPagesForRecovery(readPageStart, numPages, untilAddress, context, out _, devicePageOffset, logDevice, objectLogDevice, recoveryPhase);
/// Read pages from specified device for recovery, returning the countdown event
[MethodImpl(MethodImplOptions.NoInlining)]
private void AsyncReadPagesForRecovery(long readPageStart, int numPages, long untilAddress, TContext context,
- out CountdownEvent completed, long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null)
+ out CountdownEvent completed, long devicePageOffset = 0, IDevice logDevice = null, IDevice objectLogDevice = null, RecoveryPhase recoveryPhase = RecoveryPhase.Pass1)
{
var usedDevice = logDevice ?? this.device;
@@ -1745,7 +1763,7 @@ private void AsyncReadPagesForRecovery(long readPageStart, int numPage
context = context,
handle = completed,
maxAddressOffsetOnPage = PageSize,
- isForRecovery = true
+ recoveryPhase = recoveryPhase
};
var offsetInFile = (ulong)(AlignedPageSizeBytes * readPage);
@@ -1764,9 +1782,12 @@ private void AsyncReadPagesForRecovery(long readPageStart, int numPage
if (logDevice != null)
offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset));
- // Create separate readBuffers for each main-log page, as each page launches its own async read and callbacks are on different threads.
- // Do *not* use "using" here as we need it to survive to the ReadAsync AsyncReadPagesForRecoveryCallback.
- asyncResult.readBuffers = CreateCircularReadBuffers(objectLogDevice, logger);
+ if (recoveryPhase == RecoveryPhase.Pass2)
+ {
+ // Create separate readBuffers for each main-log page, as each page launches its own async read and callbacks are on different threads.
+ // Do *not* use "using" here as we need it to survive to the ReadAsync AsyncReadPagesForRecoveryCallback.
+ asyncResult.readBuffers = CreateCircularReadBuffers(objectLogDevice, logger);
+ }
// Call the overridden ReadAsync for the derived allocator class
ReadAsync(offsetInFile, (IntPtr)pagePointers[pageIndex], readLength, AsyncReadPagesForRecoveryCallback, asyncResult, usedDevice);
@@ -1881,11 +1902,23 @@ private protected bool PrepareFlushAsyncResult(long fromAddress, long untilAddre
}
///
- /// Flush pages asynchronously for recovery (such as when we have invalidated v+1 records).
+ /// Flush pages asynchronously for recovery (such as when we have invalidated v+1 records, or when replaying snapshot pages into the main log).
///
- public void AsyncFlushPagesForRecovery(long scanFromAddress, long flushPageStart, int numPages, DeviceIOCompletionCallback callback, TContext context)
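+ /// <param name="scanFromAddress">The lowest address being flushed on <paramref name="flushPageStart"/></param>
+ /// <param name="flushPageStart">First page to flush</param>
+ /// <param name="numPages">Number of pages to flush</param>
+ /// <param name="callback">Flush completion callback</param>
+ /// <param name="context">Callback context</param>
+ /// <param name="snapshotObjectLogDevice">For the snapshot-replay flush, the snapshot object-log device whose object bytes (for records at/above
+ /// <paramref name="formerFlushedUntilAddress"/>) are copied into the main object-log during the flush. Null for non-object or hybrid-log-only flushes.</param>
+ /// <param name="formerFlushedUntilAddress">The former FlushedUntilAddress (hybrid-log/snapshot boundary); records at/above it have their objects copied.</param>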
+ public void AsyncFlushPagesForRecovery(long scanFromAddress, long flushPageStart, int numPages, DeviceIOCompletionCallback callback, TContext context,
+ IDevice snapshotObjectLogDevice = null, long formerFlushedUntilAddress = long.MaxValue)
{
Debug.Assert(scanFromAddress < GetLogicalAddressOfStartOfPage(flushPageStart + 1), $"scanFromAddress ({scanFromAddress}) must be on flushPageStart ({flushPageStart})");
+
+ // When copying snapshot object bytes into the main object-log, we need write buffers on the main object-log device (as for a normal flush).
+ var copyObjects = snapshotObjectLogDevice is not null;
for (var flushPage = flushPageStart; flushPage < (flushPageStart + numPages); flushPage++)
{
var asyncResult = new PageAsyncFlushResult()
@@ -1896,11 +1929,14 @@ public void AsyncFlushPagesForRecovery(long scanFromAddress, long flus
partial = false,
fromAddress = Math.Max(scanFromAddress, GetLogicalAddressOfStartOfPage(flushPage)),
untilAddress = GetLogicalAddressOfStartOfPage(flushPage + 1),
- flushRequestState = FlushRequestState.Recovery
+ flushRequestState = FlushRequestState.Recovery,
+ recoverySnapshotObjectLogDevice = snapshotObjectLogDevice,
+ recoveryFormerFlushedUntilAddress = formerFlushedUntilAddress,
+ flushBuffers = copyObjects ? CreateCircularFlushBuffers(objectLogDevice: null, logger) : null
};
- // For OA, we do not use FlushBuffers here; we set isForRecovery to reuse the stored lengths rather than re-serializing objects,
- // using the lengths filled in during deserialization in RecoverHybridLog(Async), and when that is complete we fill in objectLogTail.
+ // For the snapshot region (records at/above formerFlushedUntilAddress) we copy object bytes from the snapshot object-log to the main
+ // object-log using flushBuffers; otherwise (hybrid-log region) we reuse the stored lengths/positions without writing object bytes.
WriteAsync(flushPage, callback, asyncResult);
}
}
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs
index 6c1fe626382..6673eddb72d 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs
@@ -100,6 +100,9 @@ RecordSizeInfo GetDeleteRecordSize(TKey key)
/// Return the for transient log records (e.g. iterator)
ObjectIdMap TransientObjectIdMap { get; }
+ /// <summary>Return the <see cref="ObjectIdMap"/> for a specific page number (not index)</summary>
+ ObjectIdMap GetPageObjectIdMap(long pageNumber);
+
/// Dispose an in-memory log record
void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason);
@@ -116,6 +119,9 @@ RecordSizeInfo GetDeleteRecordSize(TKey key)
/// Start logical address of the range.
/// End logical address of the range (exclusive).
/// Identifies whether this eviction is from the main log or the read cache.
- void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source);
+ /// <param name="isRecovery">True when called during recovery, where a page's object load may have been deferred — such a page has an empty
+ /// ObjectIdMap and per-record object/overflow slots that still hold raw on-disk values (not valid ObjectIdMap ids) and is skipped. False for
+ /// normal eviction, where an empty map simply means an object-free page whose inline records must still be visited.</param>
+ void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery);
}
}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs
index 5ad957a162c..58854f250a4 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/LogRecord.cs
@@ -1549,6 +1549,32 @@ internal readonly void SetObjectLogRecordStartPositionAndLength(in ObjectLogFile
SetDataHeader(dataHeader);
}
+ /// <summary>
+ /// Repoints this record's object-log position word to <paramref name="objectLogFilePosition"/> without touching the R11-encoded
+ /// key/value lengths (in the RDH fields and the int* slots at keyAddress/valueAddress) or the <see cref="ObjectIdMap"/>.
+ /// </summary>
+ /// <param name="objectLogFilePosition">The new object-log position (e.g. the main object-log position a snapshot record's bytes were copied to).</param>
+ /// <remarks>
+ /// Used by the snapshot-recovery flush, which copies a record's object bytes from the snapshot object-log to the main object-log and must
+ /// repoint the disk-image record to the main position. The record's objects are NOT deserialized at this point (objectIdMap is empty and the
+ /// int* slots still hold the on-disk R11 length high-bits), so unlike <see cref="SetObjectLogRecordStartPositionAndLength"/> and
+ /// <see cref="SetRecoveredObjectLogRecordStartPosition"/> this must not read the lengths from objectIdMap. The existing R11 length encoding
+ /// is preserved as-is, since the copied lengths are unchanged.
+ /// IMPORTANT: Like the other position setters, this is only safe to call on the disk-image copy of the record (srcBuffer).
+ /// </remarks>
+ internal readonly void RepointObjectLogPosition(in ObjectLogFilePositionInfo objectLogFilePosition)
+ {
+ if (DataHeader.RecordIsInline)
+ {
+ Debug.Fail("Cannot call RepointObjectLogPosition for an inline record");
+ return; // Debug.Fail is compiled out of release builds, where the record is simply left untouched
+ }
+
+ var (valueLength, valueAddress) = DataHeader.GetValueFieldInfo(physicalAddress);
+ var objectLogPositionPtr = (ulong*)GetObjectLogPositionAddress(valueAddress + valueLength); // the position word is located from the end of the value field
+ *objectLogPositionPtr = objectLogFilePosition.word | ObjectLogFilePositionInfo.kReuseObjectIdForSizeMask; // NOTE(review): presumably the mask flags that the lengths stay encoded in the int* slots - confirm
+ }
+
///
/// Returns the object log position for the start of the key (if any) and value (if any), with the length encoded per R11:
/// (low N bits from RDH KeyLength/ValueLength) + (next 32 bits from int* slot at keyAddress/valueAddress).
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs
index 02fa68910e7..8604c1250b9 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/MallocFixedPageSize.cs
@@ -370,18 +370,6 @@ private unsafe void AsyncFlushCallback(uint errorCode, uint numBytes, object con
#endregion
#region Recover
- ///
- /// Recover
- ///
- ///
- ///
- ///
- ///
- public void Recover(IDevice device, ulong offset, int buckets, ulong numBytes)
- {
- BeginRecovery(device, offset, buckets, numBytes, out _);
- }
-
///
/// Recover
///
@@ -397,22 +385,6 @@ public async ValueTask RecoverAsync(IDevice device, ulong offset, int buc
return numBytesRead;
}
- ///
- /// Check if recovery complete
- ///
- ///
- ///
- public bool IsRecoveryCompleted(bool waitUntilComplete = false)
- {
- bool completed = recoveryCountdown.IsCompleted;
- if (!completed && waitUntilComplete)
- {
- recoveryCountdown.Wait();
- return true;
- }
- return completed;
- }
-
// Implementation of asynchronous recovery
private CountdownWrapper recoveryCountdown;
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs
index 2c8983cb54c..f913b275b10 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocator.cs
@@ -129,6 +129,9 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key)
///
public readonly ObjectIdMap TransientObjectIdMap => _this.transientObjectIdMap;
+ /// <inheritdoc/>
+ public readonly ObjectIdMap GetPageObjectIdMap(long pageNumber) => _this.objectPages[_this.GetPageIndexForPage(pageNumber)].objectIdMap;
+
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason) => _this.OnDispose(ref logRecord, disposeReason);
@@ -138,6 +141,6 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key)
public readonly void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason disposeReason) => _this.OnDisposeDiskRecord(ref logRecord, disposeReason);
///
- public readonly void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) => _this.EvictRecordsInRange(startAddress, endAddress, source);
+ public readonly void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) => _this.EvictRecordsInRange(startAddress, endAddress, source, isRecovery);
}
}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs
index 4344f27eed7..2b25d7fd5d8 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectAllocatorImpl.cs
@@ -384,7 +384,7 @@ internal void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason dis
/// ), so this routine walks records
/// within that single page only.
///
- internal void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source)
+ internal void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery)
{
var startPage = GetPage(startAddress);
var firstValidAddress = GetFirstValidLogicalAddressOnPage(startPage);
@@ -392,10 +392,17 @@ internal void EvictRecordsInRange(long startAddress, long endAddress, EvictionSo
var pageEndAddress = GetLogicalAddressOfStartOfPage(startPage + 1);
var stopAddress = endAddress < pageEndAddress ? endAddress : pageEndAddress;
+ // During recovery a page whose object load was deferred has an empty ObjectIdMap, and its per-record object/overflow slots still hold raw
+ // on-disk values, not valid ObjectIdMap ids; they must not be dereferenced and the records are not yet materialized, so skip the page.
+ // (In normal eviction an empty map merely means an object-free page, whose inline records must still be visited for OnEvict.)
+ var objectIdMap = objectPages[GetPageIndexForAddress(address)].objectIdMap;
+ if (isRecovery && objectIdMap.IsEmpty)
+ return;
+
while (address < stopAddress)
{
var physicalAddress = GetPhysicalAddress(address);
- var logRecord = new LogRecord(physicalAddress, objectPages[GetPageIndexForAddress(address)].objectIdMap);
+ var logRecord = new LogRecord(physicalAddress, objectIdMap);
var allocatedSize = logRecord.AllocatedSize;
if (allocatedSize <= 0)
@@ -765,6 +772,14 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA
// Overflow Keys and Values are written to, and Object values are serialized to, this Stream, if we have flushBuffers.
ObjectLogWriter logWriter = null;
+ // For a snapshot-region recovery flush, the reader over the snapshot object-log device from which each record's object bytes are
+ // copied into the main object-log (appended via logWriter). Null for non-recovery flushes and for the hybrid-log region (whose
+ // objects are already durable in the main object-log).
+ var isSnapshotRecoveryCopy = asyncResult.recoverySnapshotObjectLogDevice is not null;
+ var formerFlushedUntilAddress = asyncResult.recoveryFormerFlushedUntilAddress;
+ CircularDiskReadBuffer snapshotObjectReadBuffers = null;
+ ObjectLogReader snapshotObjectReader = null;
+
// Do everything below here in the try{} to be sure the epoch is Resumed()d if we Suspended it.
SectorAlignedMemory srcBuffer = default;
try
@@ -830,6 +845,7 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA
var recoveryOngoingPageHeader = asyncResult.flushRequestState == FlushRequestState.Recovery ? pageHeader.GetLowestObjectLogPosition(objectLogTail.SegmentSizeBits) : default;
var endLogicalAddress = logicalAddress + (endPhysicalAddress - physicalAddress);
+
while (physicalAddress < endPhysicalAddress)
{
// Increment for next iteration; use allocatedSize because that is what LogicalAddress is based on.
@@ -908,10 +924,38 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA
}
else
{
- // In recovery we just need to update the disk-image LogRecord with the object lengths and file position, and then
- // advance the recoveryOngoingPageHeader position. This advancement will also take care of segment breaks if needed.
- var objectLengths = logRecord.SetRecoveredObjectLogRecordStartPosition(recoveryOngoingPageHeader);
- recoveryOngoingPageHeader.Advance(objectLengths);
+ if (isSnapshotRecoveryCopy && logicalAddress >= formerFlushedUntilAddress)
+ {
+ // Snapshot-region recovery flush: the record's objects live only in the snapshot object-log. Copy their bytes
+ // into the main object-log (appended at the current objectLogTail via logWriter) so the page becomes durable and
+ // can be evicted, then repoint the disk-image record to that main object-log position. The objects are NOT
+ // deserialized at this point, so read the position/lengths from the R11 encoding (not from objectIdMap), and use
+ // RepointObjectLogPosition (which preserves the unchanged lengths) rather than SetRecoveredObjectLogRecordStartPosition.
+ var snapshotPositionWord = logRecord.GetObjectLogRecordStartPositionAndLengths(out var copyKeyLength, out var copyValueLength);
+ var copyObjectLength = (ulong)copyKeyLength + copyValueLength;
+
+ // Demand-load the snapshot object reader on the first valid record with objects, so pages with few or no object
+ // records avoid an up-front full-page pre-pass. The read-ahead range is sized by scanning forward from here.
+ snapshotObjectReader ??= CreateSnapshotObjectReader(physicalAddress + logRecordSize, endPhysicalAddress, snapshotPositionWord,
+ copyKeyLength, copyValueLength, asyncResult.recoverySnapshotObjectLogDevice, out snapshotObjectReadBuffers);
+
+ var mainRecordPosition = logWriter.GetNextRecordStartPosition();
+
+ // Position/await the snapshot read buffers at this record (skips sector padding and waits for the read-ahead IO),
+ // then stream the record's bytes verbatim into the main object-log.
+ if (!snapshotObjectReadBuffers.OnBeginRecord(new ObjectLogFilePositionInfo(snapshotPositionWord, objectLogTail.SegmentSizeBits)))
+ throw new TsavoriteException("No snapshot object-log data available while copying objects during recovery");
+ logWriter.CopyRecoveredObjectBytes(snapshotObjectReader, copyObjectLength);
+ logRecord.RepointObjectLogPosition(mainRecordPosition);
+ recoveryOngoingPageHeader.Advance(copyObjectLength);
+ }
+ else
+ {
+ // In recovery we just need to update the disk-image LogRecord with the object lengths and file position, and then
+ // advance the recoveryOngoingPageHeader position. This advancement will also take care of segment breaks if needed.
+ var objectLengths = logRecord.SetRecoveredObjectLogRecordStartPosition(recoveryOngoingPageHeader);
+ recoveryOngoingPageHeader.Advance(objectLengths);
+ }
}
// Do this for both cases so it's clear when debugging
@@ -956,7 +1000,43 @@ private void WriteAsync(long flushPage, ulong alignedMainLogFlushPageA
if (protectEpochWhenDone)
epoch.Resume();
logWriter?.Dispose();
+ snapshotObjectReader?.OnEndReadRecords();
+ snapshotObjectReadBuffers?.Dispose();
+ }
+ }
+
+ /// <summary>
+ /// Demand-loads (creates and seeds) the reader over the snapshot object-log for a snapshot-region recovery flush, on the first valid record
+ /// with objects on the page. The read-ahead range is sized by scanning forward from <paramref name="nextRecordAddress"/> to the last object
+ /// record on the page, so pages with few or no object records avoid an up-front full-page pre-pass.
+ /// </summary>
+ /// <param name="nextRecordAddress">The (disk-image) address of the record just after the first object record.</param>
+ /// <param name="endPhysicalAddress">The end of the page's records in the disk image.</param>
+ /// <param name="firstPositionWord">The snapshot object-log position word of the first object record (the read-ahead start).</param>
+ /// <param name="firstKeyLength">The first object record's key length.</param>
+ /// <param name="firstValueLength">The first object record's value length.</param>
+ /// <param name="snapshotObjectLogDevice">The snapshot object-log device to read from.</param>
+ /// <param name="readBuffers">Outputs the created read buffers; the caller disposes them.</param>
+ private ObjectLogReader CreateSnapshotObjectReader(long nextRecordAddress, long endPhysicalAddress, ulong firstPositionWord,
+ int firstKeyLength, ulong firstValueLength, IDevice snapshotObjectLogDevice, out CircularDiskReadBuffer readBuffers)
+ {
+ var startPosition = new ObjectLogFilePositionInfo(firstPositionWord, objectLogTail.SegmentSizeBits);
+ var endPosition = startPosition; // if no later object record is found, the range covers only the first record
+ var endKeyLength = firstKeyLength;
+ var endValueLength = firstValueLength;
+ for (var scanAddress = nextRecordAddress; scanAddress < endPhysicalAddress;) // NOTE(review): no AllocatedSize <= 0 guard in this scan - confirm the page image was validated earlier
+ {
+ var scanRecord = new LogRecord(scanAddress);
+ scanAddress += scanRecord.AllocatedSize;
+ if (scanRecord.Info.Valid && scanRecord.DataHeader.RecordHasObjects) // remember the position and lengths of the last object record seen
+ endPosition = new(scanRecord.GetObjectLogRecordStartPositionAndLengths(out endKeyLength, out endValueLength), objectLogTail.SegmentSizeBits);
}
+ endPosition.Advance((ulong)endKeyLength + endValueLength); // step past the last object record's key and value bytes
+
+ readBuffers = CreateCircularReadBuffers(snapshotObjectLogDevice, logger);
+ var reader = new ObjectLogReader(readBuffers, storeFunctions);
+ reader.OnBeginReadRecords(startPosition, endPosition - startPosition); // total byte span from the first to the end of the last object record
+ return reader;
+ }
///
@@ -1120,75 +1200,216 @@ private void AsyncReadPageWithObjectsCallback(uint errorCode, uint num
return;
}
- var pageStartAddress = (long)result.destinationPtr;
+ // If this is Recovery Pass 1 we skip object deserialization (frame reads are in RecoveryPhase.None).
+ if (result.recoveryPhase != RecoveryPhase.Pass1)
+ {
+ var objectIdMapToUse = result.recoveryPhase != RecoveryPhase.None ? objectPages[result.page % BufferSize].objectIdMap : transientObjectIdMap;
+ DeserializeObjectsOnPage((long)result.destinationPtr, result.maxAddressOffsetOnPage, objectIdMapToUse, result.readBuffers);
+ }
+
+ // Call the "real" page read callback
+ result.callback(errorCode, numBytes, context);
+ result.Free();
+ }
- // Iterate all records in range to determine how many bytes we need to read from objlog.
+ /// <summary>
+ /// Deserialize objects on a page that has already been loaded into memory (physical addresses).
+ /// Scans records to determine object log ranges, reads from the object log via the provided readBuffers, and deserializes objects.
+ /// </summary>
+ /// <param name="pageStartPhysicalAddress">Physical start address of the page in memory</param>
+ /// <param name="maxAddressOffsetOnPage">Maximum offset on the page (PageSize or less for partial pages)</param>
+ /// <param name="objectIdMap">The ObjectIdMap to use for deserialized objects</param>
+ /// <param name="readBuffers">The circular read buffers for object log reading</param>
+ private void DeserializeObjectsOnPage(long pageStartPhysicalAddress, long maxAddressOffsetOnPage, ObjectIdMap objectIdMap, CircularDiskReadBuffer readBuffers)
+ {
ObjectLogFilePositionInfo startPosition = new(), endPosition = new();
var endKeyLength = 0;
ulong endValueLength = 0;
- ulong totalBytesToRead = 0;
- var recordAddress = pageStartAddress + PageHeader.Size;
- var endAddress = pageStartAddress + result.maxAddressOffsetOnPage;
+ var recordAddress = pageStartPhysicalAddress + PageHeader.Size;
+ var endAddress = pageStartPhysicalAddress + maxAddressOffsetOnPage;
+ // First pass: determine the range of object log bytes to read (start of the first object record to the end of the last one)
while (recordAddress < endAddress)
{
- // Increment for next iteration; use allocatedSize because that is what LogicalAddress is based on.
var logRecord = new LogRecord(recordAddress);
recordAddress += logRecord.AllocatedSize;
if (logRecord.DataHeader.RecordHasObjects && logRecord.Info.Valid)
{
- if (!startPosition.IsSet)
- startPosition = new(logRecord.GetObjectLogRecordStartPositionAndLengths(out _, out _), objectLogTail.SegmentSizeBits);
endPosition = new(logRecord.GetObjectLogRecordStartPositionAndLengths(out endKeyLength, out endValueLength), objectLogTail.SegmentSizeBits);
+ if (!startPosition.IsSet)
+ startPosition = endPosition; // first object record on the page: its position is also the start of the read range
}
}
// The page may not have contained any records with objects
- if (startPosition.IsSet)
- {
- endPosition.Advance((ulong)endKeyLength + endValueLength);
- totalBytesToRead = endPosition - startPosition;
+ if (!startPosition.IsSet)
+ return;
- // Iterate all records again to actually do the deserialization.
- result.readBuffers.nextFileReadPosition = startPosition;
- recordAddress = pageStartAddress + PageHeader.Size;
- var logReader = new ObjectLogReader(result.readBuffers, storeFunctions);
- logReader.OnBeginReadRecords(startPosition, totalBytesToRead);
+ endPosition.Advance((ulong)endKeyLength + endValueLength);
+ var totalBytesToRead = endPosition - startPosition;
- var objectIdMapToUse = result.isForRecovery ? objectPages[result.page % BufferSize].objectIdMap : transientObjectIdMap;
+ // Second pass: deserialize objects, re-walking the page from its first record
+ readBuffers.nextFileReadPosition = startPosition;
+ recordAddress = pageStartPhysicalAddress + PageHeader.Size;
+ var logReader = new ObjectLogReader(readBuffers, storeFunctions);
+ logReader.OnBeginReadRecords(startPosition, totalBytesToRead);
+ try
+ {
while (recordAddress < endAddress)
{
- // Increment for next iteration; use allocatedSize because that is what LogicalAddress is based on.
- var logRecord = new LogRecord(recordAddress, objectIdMapToUse);
+ var logRecord = new LogRecord(recordAddress, objectIdMap);
recordAddress += logRecord.AllocatedSize;
if (logRecord.DataHeader.RecordHasObjects && logRecord.Info.Valid)
{
_ = logReader.ReadRecordObjects(ref logRecord, default(EmptyKey), startPosition.SegmentSizeBits);
- // CalculateHeapMemorySize returns 0 for tombstones, but eviction subtracts
- // key overflow for tombstoned records. Add it here so the tracker stays balanced.
- if (logRecord.Info.Tombstone)
- {
- if (logRecord.DataHeader.KeyIsOverflow)
- logSizeTracker?.IncrementSize(logRecord.KeyOverflow.HeapMemorySize);
- }
- else
- {
- logSizeTracker?.UpdateSize(in logRecord, add: true);
- }
+ TrackRecoveredObjectRecord(in logRecord);
}
}
-
- // Ensure we have finished all object reads
+ }
+ finally
+ {
logReader.OnEndReadRecords();
}
+ }
- // Call the "real" page read callback
- result.callback(errorCode, numBytes, context);
- result.Free();
- return;
+ /// <summary>Adds a just-deserialized record's heap footprint to the log size tracker; does nothing when there is no tracker.</summary>
+ private void TrackRecoveredObjectRecord(in LogRecord logRecord)
+ {
+ var tracker = logSizeTracker;
+ if (tracker is null)
+ return;
+ if (!logRecord.Info.Tombstone)
+ {
+ tracker.UpdateSize(in logRecord, add: true);
+ return;
+ }
+ // Tombstone: CalculateHeapMemorySize returns 0, but eviction still subtracts the key overflow, so add it here to keep the tracker balanced.
+ if (logRecord.DataHeader.KeyIsOverflow)
+ tracker.IncrementSize(logRecord.KeyOverflow.HeapMemorySize);
+ }
+
+ /// <inheritdoc/>
+ internal override long CalculatePageObjectSizes(long page, long startAddress, long untilAddress)
+ {
+ // Clamp the requested range to this page's records.
+ var recordAddress = Math.Max(startAddress, GetFirstValidLogicalAddressOnPage(page));
+ var endAddress = Math.Min(untilAddress, GetLogicalAddressOfStartOfPage(page + 1));
+ var objectBytes = 0L;
+ while (recordAddress < endAddress)
+ {
+ var current = new LogRecord(GetPhysicalAddress(recordAddress));
+ var allocatedSize = current.AllocatedSize;
+ if (allocatedSize <= 0)
+ ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}");
+
+ // Step past the record first; the lengths below are read from the record itself, not from the cursor.
+ recordAddress += allocatedSize;
+ if (recordAddress > endAddress)
+ ThrowTsavoriteException($"Unaligned end of page; record exceeded page by {recordAddress - endAddress} bytes");
+ if (!current.Info.Valid || !current.DataHeader.RecordHasObjects)
+ continue;
+
+ // Only the lengths are needed here; the returned position word is discarded.
+ _ = current.GetObjectLogRecordStartPositionAndLengths(out var keyBytes, out var valueBytes);
+ objectBytes += keyBytes + (long)valueBytes;
+ }
+ return objectBytes;
+ }
+
+ /// <summary>
+ /// Determine if this is the last valid record on the page.
+ /// </summary>
+ /// <param name="recordAddress">Address of the current record</param>
+ /// <param name="endAddress">Address of the end of the page</param>
+ /// <returns>True if this is the last valid record on the page, otherwise false</returns>
+ private bool IsLastRecordOnPage(long recordAddress, long endAddress, out long nextRecordAddress)
+ {
+ while (recordAddress < endAddress) // NOTE(review): the scan includes the record AT recordAddress, so a valid record there returns false - confirm what callers pass
+ {
+ var logRecord = new LogRecord(GetPhysicalAddress(recordAddress));
+ var allocatedSize = logRecord.AllocatedSize;
+ if (allocatedSize <= 0)
+ ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}");
+
+ recordAddress += allocatedSize;
+ if (recordAddress > endAddress)
+ ThrowTsavoriteException($"Unaligned end of page; record exceeded page by {recordAddress - endAddress} bytes");
+
+ if (logRecord.Info.Valid)
+ {
+ nextRecordAddress = recordAddress; // already advanced: this is the address just past the valid record that was found
+ return false;
+ }
+ }
+ nextRecordAddress = -1L; // no valid record was found before endAddress
+ return true;
+ }
+
+ /// <inheritdoc/>
+ internal override void LoadObjectsForRecoveryPass2(long page, long fromAddress, long untilAddress, IDevice objectLogDevice)
+ {
+ var pageStartAddress = GetFirstValidLogicalAddressOnPage(page);
+ var address = Math.Max(fromAddress, pageStartAddress);
+ var endAddress = Math.Min(untilAddress, GetLogicalAddressOfStartOfPage(page + 1));
+ if (address >= endAddress) // the requested range does not overlap this page's records
+ return;
+
+ var pagePhysicalAddress = GetPhysicalAddress(GetLogicalAddressOfStartOfPage(page));
+ var maxOffset = endAddress - GetLogicalAddressOfStartOfPage(page); // page-relative end offset, the form DeserializeObjectsOnPage takes
+ var objectIdMapToUse = objectPages[page % BufferSize].objectIdMap;
+ using var readBuffers = CreateCircularReadBuffers(objectLogDevice, logger); // disposed when this method exits
+ DeserializeObjectsOnPage(pagePhysicalAddress, maxOffset, objectIdMapToUse, readBuffers); // NOTE(review): this scans from the page's first record; fromAddress only gates the early return above - confirm intended
+ }
+
+ /// <inheritdoc/>
+ internal override long FindHeadAddressCutoffOnPage(long page, long untilAddress, long totalPageObjectSize, int numPagesBelowCurrentPage, long remainingBudget, out int numPagesBelowToEvict)
+ {
+ var recordAddress = GetFirstValidLogicalAddressOnPage(page);
+ var stopAddress = Math.Min(untilAddress, GetLogicalAddressOfStartOfPage(page + 1));
+ var overBudgetAmount = totalPageObjectSize - remainingBudget; // positive when this page's object bytes exceed the remaining budget
+ if (overBudgetAmount <= 0)
+ {
+ numPagesBelowToEvict = 0;
+ return recordAddress; // everything fits: the cutoff is the page's first valid record
+ }
+
+ // We are over budget. First see if we can evict enough pages to get below budget.
+ var pagesToEvictToGetUnderBudget = (int)((overBudgetAmount + PageSize - 1) / PageSize); // ceiling division; NOTE(review): treats each evicted page as freeing PageSize bytes of budget - confirm
+ if (pagesToEvictToGetUnderBudget <= numPagesBelowCurrentPage)
+ {
+ // We can, and may even still have some pages left below us that can remain.
+ numPagesBelowToEvict = pagesToEvictToGetUnderBudget;
+ return recordAddress;
+ }
+
+ // We cannot evict enough pages to get under budget. Evict all pages below, and then skip records on this page until we are under budget.
+ numPagesBelowToEvict = numPagesBelowCurrentPage;
+ overBudgetAmount -= (long)numPagesBelowToEvict * PageSize;
+
+ while (recordAddress < stopAddress)
+ {
+ var logRecord = new LogRecord(GetPhysicalAddress(recordAddress));
+ var allocatedSize = logRecord.AllocatedSize;
+ if (allocatedSize <= 0)
+ ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}");
+
+ recordAddress += allocatedSize;
+ if (recordAddress > stopAddress)
+ ThrowTsavoriteException($"Unaligned end of page; record exceeded page by {recordAddress - stopAddress} bytes");
+
+ if (logRecord.Info.Valid && logRecord.DataHeader.RecordHasObjects)
+ {
+ _ = logRecord.GetObjectLogRecordStartPositionAndLengths(out var keyLength, out var valueLength);
+ overBudgetAmount -= keyLength + (long)valueLength; // skipping this record reclaims its key and value object bytes
+ if (overBudgetAmount <= 0)
+ return recordAddress; // already advanced: the cutoff is just past the record that brought us under budget
+ }
+ }
+
+ return stopAddress; // still over budget after skipping every record in range
}
///
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs
index 468ac39ddd7..e544dc5a0ad 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectIdMap.cs
@@ -35,6 +35,8 @@ internal ObjectIdMap()
internal int Count => objectArray.Count;
+ internal bool IsEmpty => Count == 0; // true when the map currently holds no entries
+
/// Reserve a slot and return its ID.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int Allocate()
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs
index 94820531b07..ce2bf23fef4 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/ObjectLogWriter.cs
@@ -109,6 +109,43 @@ public ulong WriteRecordObjects(in OverflowByteArray keyOverflow, in OverflowByt
return valueObjectBytesWritten;
}
+ /// <summary>
+ /// Copies <paramref name="totalLength"/> bytes of a record's serialized object data verbatim from the snapshot object-log (via
+ /// <paramref name="reader"/>) into this (main) object-log, then signals record completion. Used by the snapshot-region recovery
+ /// flush, which copies a record's object bytes without deserialize/reserialize. The <paramref name="reader"/> must already be
+ /// positioned at the record (via <see cref="CircularDiskReadBuffer.OnBeginRecord"/>).
+ /// </summary>
+ /// <param name="reader">The reader over the snapshot object-log, positioned at the record to copy.</param>
+ /// <param name="totalLength">The total number of object-log bytes for the record (key plus value).</param>
+ public void CopyRecoveredObjectBytes(ObjectLogReader reader, ulong totalLength)
+ {
+ if (totalLength > 0)
+ {
+ var buffer = flushBuffers.bufferPool.Get(IStreamBuffer.BufferSize); // scratch chunk buffer, returned to the pool in the finally below
+ try
+ {
+ var chunkSpan = buffer.TotalValidSpan;
+ var remaining = totalLength;
+ while (remaining > 0)
+ {
+ var requestLength = (int)Math.Min(remaining, (ulong)chunkSpan.Length); // never request more than the chunk can hold
+ var bytesRead = reader.Read(chunkSpan.Slice(0, requestLength));
+ if (bytesRead == 0)
+ throw new TsavoriteException("Unexpected end of snapshot object-log data while copying objects during recovery");
+ Write(chunkSpan.Slice(0, bytesRead)); // a read may return fewer bytes than requested, so write only what was read
+ remaining -= (ulong)bytesRead;
+ }
+ }
+ finally
+ {
+ flushBuffers.bufferPool.Return(buffer);
+ }
+ }
+
+ // Signal completion, as WriteRecordObjects does.
+ flushBuffers.OnRecordComplete();
+ }
+
/// Start off the write using the full span of the .
/// The to write.
void WriteDirect(OverflowByteArray overflow) => WriteDirect(overflow, overflow.ReadOnlySpan, refCountedGCHandle: default);
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs
index 893a1ef900d..41958e418e9 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/SpanByteAllocator.cs
@@ -128,6 +128,9 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key)
///
public readonly ObjectIdMap TransientObjectIdMap => default;
+ /// <inheritdoc/>
+ public readonly ObjectIdMap GetPageObjectIdMap(long pageNumber) => default;
+
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason) => _this.OnDispose(ref logRecord, disposeReason);
@@ -137,6 +140,6 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key)
public void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason disposeReason) => _this.OnDisposeDiskRecord(ref logRecord, disposeReason);
///
- public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) { }
+ public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) { }
}
}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs
index e69b4ba783c..0fc40e4766f 100644
--- a/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Allocator/TsavoriteLogAllocator.cs
@@ -127,6 +127,9 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key)
///
public readonly ObjectIdMap TransientObjectIdMap => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator");
+ /// <inheritdoc/>
+ public readonly ObjectIdMap GetPageObjectIdMap(long pageNumber) => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator");
+
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void OnDispose(ref LogRecord logRecord, DisposeReason disposeReason) => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator");
@@ -136,6 +139,6 @@ public readonly RecordSizeInfo GetDeleteRecordSize(TKey key)
public void OnDisposeDiskRecord(ref DiskLogRecord logRecord, DisposeReason disposeReason) => throw new NotImplementedException("Not implemented for TsavoriteLogAllocator");
///
- public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source) { }
+ public void EvictRecordsInRange(long startAddress, long endAddress, EvictionSource source, bool isRecovery) { }
}
}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs
index 8dc5c53897c..533156c8c67 100644
--- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSettings.cs
@@ -25,6 +25,8 @@ internal class LogSettings
/// for object serialization to the object log.
public const int kMaxSegmentSizeBits = 62;
+ public const int kMinPageCount = 2;
+
/// Minimum number of bits for the size of the in-memory portion of the log
public const int kMinMemorySizeBits = kMinPageSizeBits + 1;
/// Maximum number of bits for the size of the in-memory portion of the log
diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs
index beaefe8cf1b..55ab03a5450 100644
--- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs
@@ -23,11 +23,14 @@ public class LogSizeTracker
public static readonly int ResizeTaskDelaySeconds = 10;
/// Target size must be at least this many pages; this gives us (at least a little) room for heap allocations in a minimum of
- /// pages.
- public const int MinTargetPageCount = MinResizeTargetPageCount * 2;
+ /// <see cref="LogSettings.kMinPageCount"/> pages.</summary>
+ public const int MinTargetPageCount = LogSettings.kMinPageCount * 2;
- /// When resizing we must preserve at least this many pages
- public const int MinResizeTargetPageCount = 2;
+ /// <summary>
+ /// When evicting, do not allow HeadAddress to advance to within this many bytes of TailAddress. This usually allows more than one usable record in
+ /// the database. If there are records with objects in that range that exceed the memory budget, then the memory budget should be adjusted to allow for it.
+ /// </summary>
+ public const int MinEvictionHeadAddressLag = 4096;
}
/// Tracks and controls size of log
@@ -85,11 +88,15 @@ enum RunState : int { NotStarted, Running, StopRequested, Stopped };
public override string ToString()
{
return $"{runState}; TargetSize: [{TargetSize}, hi: {highTargetSize}, lo: {lowTargetSize}]; TotalSize: [{TotalSize}, Heap: {heapSize.Total}];"
- + $" isOver: [{IsBeyondSizeLimit}, canEvict {IsBeyondSizeLimitAndCanEvict}]; AllocPgCt: {logAccessor.AllocatedPageCount}; PgSize {logAccessor.allocatorBase.PageSize}";
+ + $" isOver: [{IsOverBudget}, canEvict {IsBeyondSizeLimitAndCanEvict()}]; AllocPgCt: {logAccessor.AllocatedPageCount}; PgSize {logAccessor.allocatorBase.PageSize}"; // IsBeyondSizeLimitAndCanEvict is a method and must be invoked: a bare method group formats as the delegate type name, not the bool
}
+ /// <summary>Returns the memory budget we have remaining</summary>
+ /// <remarks>May return a negative value if already over budget.</remarks>
+ public long RemainingBudget => highTargetSize - TotalSize;
+
/// Return true if the total size is outside the target plus delta
- public bool IsBeyondSizeLimit => TotalSize > highTargetSize;
+ public bool IsOverBudget => TotalSize > highTargetSize;
/// Return true if the total size is outside the target plus delta *and* we have pages we can (partially or completely) evict
/// If true, we are allocating a new page. Otherwise, we are called when adding or growing a new
@@ -105,15 +112,12 @@ public bool IsBeyondSizeLimitAndCanEvict(bool addingPage = false)
if (addingPage && numPages == logAccessor.allocatorBase.MaxAllocatedPageCount)
return true;
- // Otherwise, we need at least MinResizeTargetPageCount to be able to evict anything.
- return (TotalSize > highTargetSize) && numPages > MinResizeTargetPageCount;
+ // Otherwise, we need at least MinEvictionHeadAddressLag to be able to evict anything. Use UnstableGetTailAddress (as above): this is
+ // reached from HandlePageOverflow on the thread that owns tail-address stabilization, and the stable GetTailAddress() would spin-wait
+ // forever for a TailPageOffset that only this same thread can reset (after NeedToWaitForClose returns).
+ return (TotalSize > highTargetSize) && logAccessor.allocatorBase.UnstableGetTailAddress(out _) - logAccessor.allocatorBase.HeadAddress >= MinEvictionHeadAddressLag;
}
- /// Return true if the total size plus the size needed for the requested number of pages to read is outside the target plus delta *and*
- /// we have pages we can (partially or completely) evict
- /// This is called by Recovery.
- public bool IsBeyondSizeLimitToReadPages(int numPagesToRead) => TotalSize + (numPagesToRead * logAccessor.allocatorBase.PageSize) > highTargetSize;
-
/// Creates a new log size tracker
/// Hybrid log accessor
/// Target size for the hybrid log memory utilization
@@ -267,77 +271,107 @@ private bool DetermineEvictionRange(long currentSize, CancellationToken cancella
ref int allocatedPageCount, out long estimatedHeapTrimmedSize)
{
// We know we are oversize so we calculate how much we need to trim to get to lowTargetSize.
- var overSize = currentSize - lowTargetSize;
+ var overBudgetAmount = currentSize - lowTargetSize;
estimatedHeapTrimmedSize = 0L;
var allocator = logAccessor.allocatorBase;
headAddress = allocator.HeadAddress;
- var headPage = allocator.GetPage(headAddress);
- var untilAddress = allocator.UnstableGetTailAddress(out _);
- var untilPage = allocator.GetPage(untilAddress);
-
- // The number of pages we have is untilPage - headPage + 1.
- if (untilPage - headPage + 1 <= MinResizeTargetPageCount)
- return false;
- untilAddress = allocator.GetLogicalAddressOfStartOfPage(untilPage - MinResizeTargetPageCount + 1);
+ var startingHeadPage = allocator.GetPage(headAddress);
+ var maxEvictUntilAddress = allocator.UnstableGetTailAddress(out _) - MinEvictionHeadAddressLag;
+ var maxEvictUntilPage = allocator.GetPage(maxEvictUntilAddress);
- // If there is nothing to trim from the heap, we can just do math to advance HA.
+ // If there is nothing to trim from the heap, we just do math to trim as many pages as we need to (up to the limit).
if (heapSize.Total == 0)
{
- var evictableSize = untilAddress - headAddress;
- var isComplete = overSize <= evictableSize;
- if (!isComplete)
- overSize = evictableSize;
- headAddress = RoundUp(headAddress + overSize, Constants.kRecordAlignment);
-
- // Scan from head of page to snap headAddress to the next record boundary.
- var pageIndex = allocator.GetPage(headAddress);
- var pageStartAddress = allocator.GetLogicalAddressOfStartOfPage(pageIndex);
- var offset = headAddress - pageStartAddress;
- if (offset <= PageHeader.Size)
- headAddress = pageStartAddress;
- else
+ // We are evicting in units of pages, so we set this to the start of the maxEvictUntilPage.
+ maxEvictUntilAddress = allocator.GetLogicalAddressOfStartOfPage(maxEvictUntilPage);
+ var evictableSize = maxEvictUntilAddress - headAddress;
+
+ // evictableSize is the resident span [headAddress, tail-aligned). When heapSize is 0, TotalSize == AllocatedPageCount * PageSize, so being
+ // over budget here means AllocatedPageCount * PageSize > budget; recovery keeps AllocatedPageCount within MaxAllocatedPageCount (the read
+ // batch is capped at the budget and a final trim evicts any object-free overage), so AllocatedPageCount ~= the resident page count and that
+ // resident span must itself exceed the budget => evictableSize > 0. A negative value would mean AllocatedPageCount exceeds the resident set
+ // (stale pages left allocated below headAddress), which we must not reach.
+ Debug.Assert(evictableSize >= 0, $"evictableSize ({evictableSize}) must be non-negative; AllocatedPageCount exceeds the resident set below headAddress.");
+
+ var margin = evictableSize - overBudgetAmount;
+ var isComplete = margin > 0;
+ if (isComplete)
{
- var currentAddress = pageStartAddress + PageHeader.Size;
- var physicalAddress = allocator.GetPhysicalAddress(currentAddress);
- while (currentAddress < headAddress)
- {
- var allocatedSize = new LogRecord(physicalAddress).AllocatedSize;
- currentAddress += allocatedSize;
- physicalAddress += allocatedSize;
- }
+ // We can completely satisfy the over-budget amount, so we can add some pages back to keep more below maxEvictUntilPage.
+ var additionalPagesToKeep = margin / allocator.PageSize;
+ maxEvictUntilPage -= additionalPagesToKeep;
}
- allocatedPageCount -= (int)(allocator.GetPage(headAddress) - headPage);
+ // We'll evict the pages below maxEvictUntilPage, so the new headAddress is the first valid logical address on maxEvictUntilPage.
+ headAddress = allocator.GetFirstValidLogicalAddressOnPage(maxEvictUntilPage);
+
+ allocatedPageCount -= (int)(maxEvictUntilPage - startingHeadPage);
return isComplete;
}
- // This will iterate until iterator.CurrentAddress == untilAddress
- using var iterator = logAccessor.Scan(headAddress, untilAddress);
- allocatedPageCount = allocator.AllocatedPageCount;
+ // We have heap objects we can potentially evict. Walk forward from headAddress, stopping at maxEvictUntilAddress or once the over-budget amount is covered.
+ // To optimize performance, iterate pages and skip a whole page when it has no object-id map or the map is empty; otherwise enumerate the records on the page.
var pageTrimmedSize = 0L;
- while (estimatedHeapTrimmedSize + pageTrimmedSize < overSize && iterator.GetNext() && !IsStopped)
+ var lastEvictPage = allocator.GetPage(maxEvictUntilAddress);
+ for (var currentPage = startingHeadPage; currentPage <= lastEvictPage && estimatedHeapTrimmedSize + pageTrimmedSize < overBudgetAmount && !IsStopped; currentPage++)
{
cancellationToken.ThrowIfCancellationRequested();
- estimatedHeapTrimmedSize += iterator.CalculateHeapMemorySize();
- // If we've crossed a page boundary, we can subtract the pagesize as well.
- var currentPage = allocator.GetPage(iterator.CurrentAddress);
- if (currentPage > headPage)
+ if (currentPage != startingHeadPage)
+ headAddress = allocator.GetFirstValidLogicalAddressOnPage(currentPage);
+
+ // If there are no objects on this page and it's below maxEvictUntilPage (which may not be able to be evicted fully),
+ // we can skip the whole page and just subtract the pagesize from the amount we need to trim.
+ if (currentPage < maxEvictUntilPage)
+ {
+ var oidMap = allocator._wrapper.GetPageObjectIdMap(currentPage);
+ if (oidMap is null || oidMap.Count == 0)
+ {
+ pageTrimmedSize += allocator.PageSize;
+ if (estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount)
+ {
+ // Set headAddress to the start of the next page and we're done.
+ headAddress = allocator.GetFirstValidLogicalAddressOnPage(currentPage + 1);
+ break;
+ }
+ continue;
+ }
+ }
+
+ // We have objects, so iterate records to see where the new headAddress must be. Don't go past maxEvictUntilAddress.
+ var endAddress = allocator.GetLogicalAddressOfStartOfPage(currentPage + 1);
+ if (endAddress > maxEvictUntilAddress)
+ endAddress = maxEvictUntilAddress;
+ while (headAddress < endAddress)
+ {
+ var logRecord = allocator._wrapper.CreateLogRecord(headAddress);
+ var allocatedSize = logRecord.AllocatedSize;
+ if (allocatedSize <= 0)
+ ThrowTsavoriteException($"LogRecord size should be > 0; encountered {allocatedSize}");
+
+ headAddress += allocatedSize;
+ if (!logRecord.Info.Valid)
+ continue;
+
+ estimatedHeapTrimmedSize += logRecord.CalculateHeapMemorySize();
+ if (estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount)
+ break;
+ }
+
+ // If we have finished a page, add its size to our eviction total and set headAddress to the start of the next page.
+ if (headAddress >= endAddress)
{
- headPage = currentPage;
- --allocatedPageCount;
pageTrimmedSize += allocator.PageSize;
+ headAddress = allocator.GetFirstValidLogicalAddressOnPage(currentPage + 1);
}
- }
- // iterator.NextAddress is the end of the last-processed record; if we did not advance far enough to clear all the oversize space
- // it is the start of the next record we would have processed (and probably equal to untilAddress). In both cases it is how far we
- // can evict to, and because it is the next address we've not yet evaluated whether it's crossed the page boundary; do that here.
- headAddress = iterator.NextAddress;
+ if (estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount)
+ break;
+ }
- // Return whether we could satisfy the resize request; for Recovery, we may need to wait on flush.
- return estimatedHeapTrimmedSize + pageTrimmedSize >= overSize;
+ // headAddress is now properly set. Return whether we could satisfy the resize request; for Recovery, we may need to wait on flush.
+ return estimatedHeapTrimmedSize + pageTrimmedSize >= overBudgetAmount;
}
///
@@ -353,13 +387,16 @@ private void ResizeIfNeeded(CancellationToken cancellationToken)
long headAddress, estimatedHeapTrimmedSize, readOnlyAddress;
var isComplete = false;
- var allocatedPageCount = logAccessor.AllocatedPageCount;
- logger?.LogDebug("Heap size {totalLogSize} > target {highTargetSize}. Alloc: {AllocatedPageCount} BufferSize: {BufferSize}", heapSize.Total, highTargetSize, allocatedPageCount, logAccessor.BufferSize);
+ int allocatedPageCount;
// Acquire the epoch long enough to calculate eviction ranges.
logAccessor.allocatorBase.epoch.Resume();
try
{
+ // AllocatedPageCount is set here, after we've resumed the epoch (which may have done eviction).
+ allocatedPageCount = logAccessor.AllocatedPageCount;
+ logger?.LogDebug("Heap size {totalLogSize} > target {highTargetSize}. Alloc: {AllocatedPageCount} BufferSize: {BufferSize}", heapSize.Total, highTargetSize, allocatedPageCount, logAccessor.BufferSize);
+
// See how much we can evict from HeadAddress onwards. Ignore the return value that indicates whether this is complete;
// we calculate the new ROA up to MinTargetPageCount pages before TailAddress, and that's as far as we can go.
isComplete = DetermineEvictionRange(currentSize, cancellationToken, out headAddress, ref allocatedPageCount, out estimatedHeapTrimmedSize);
diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs
index fed7c5ab07b..f75200046f2 100644
--- a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/IndexRecovery.cs
@@ -21,16 +21,6 @@ public partial class TsavoriteBase
public ICheckpointManager CheckpointManager => checkpointManager;
// Derived class exposed API
- internal void RecoverFuzzyIndex(IndexCheckpointInfo info)
- {
- ulong alignedIndexSize = InitializeMainIndexRecovery(ref info, isAsync: false);
- overflowBucketsAllocator.Recover(info.main_ht_device, alignedIndexSize, info.info.num_buckets, info.info.num_ofb_bytes);
-
- // Wait until reading is complete
- IsFuzzyIndexRecoveryComplete(true);
- FinalizeMainIndexRecovery(info);
- }
-
internal async ValueTask RecoverFuzzyIndexAsync(IndexCheckpointInfo info, CancellationToken cancellationToken)
{
ulong alignedIndexSize = InitializeMainIndexRecovery(ref info, isAsync: true);
@@ -69,15 +59,6 @@ private void FinalizeMainIndexRecovery(IndexCheckpointInfo info)
DeleteTentativeEntries();
}
- // Test-only
- internal void RecoverFuzzyIndex(int ht_version, IDevice device, ulong num_ht_bytes, IDevice ofbdevice, int num_buckets, ulong num_ofb_bytes)
- {
- BeginMainIndexRecovery(ht_version, device, num_ht_bytes);
- var sectorSize = device.SectorSize;
- var alignedIndexSize = (num_ht_bytes + (sectorSize - 1)) & ~((ulong)sectorSize - 1);
- overflowBucketsAllocator.Recover(ofbdevice, alignedIndexSize, num_buckets, num_ofb_bytes);
- }
-
// Test-only
internal async ValueTask RecoverFuzzyIndexAsync(int ht_version, IDevice device, ulong num_ht_bytes, IDevice ofbdevice, int num_buckets, ulong num_ofb_bytes, CancellationToken cancellationToken)
{
@@ -88,13 +69,6 @@ internal async ValueTask RecoverFuzzyIndexAsync(int ht_version, IDevice device,
await overflowBucketsAllocator.RecoverAsync(ofbdevice, alignedIndexSize, num_buckets, num_ofb_bytes, cancellationToken).ConfigureAwait(false);
}
- internal bool IsFuzzyIndexRecoveryComplete(bool waitUntilComplete = false)
- {
- bool completed1 = IsMainIndexRecoveryCompleted(waitUntilComplete);
- bool completed2 = overflowBucketsAllocator.IsRecoveryCompleted(waitUntilComplete);
- return completed1 && completed2;
- }
-
///
/// Main Index Recovery Functions
///
@@ -131,17 +105,6 @@ private unsafe void BeginMainIndexRecovery(
Debug.Assert(numBytesRead == num_bytes);
}
- private bool IsMainIndexRecoveryCompleted(bool waitUntilComplete = false)
- {
- bool completed = recoveryCountdown.IsCompleted;
- if (!completed && waitUntilComplete)
- {
- recoveryCountdown.Wait();
- return true;
- }
- return completed;
- }
-
private unsafe void AsyncPageReadCallback(uint errorCode, uint numBytes, object overlap)
{
if (errorCode != 0)
diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs
index 2939f1b19cf..10e04450417 100644
--- a/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Index/Recovery/Recovery.cs
@@ -25,6 +25,9 @@ internal sealed class RecoveryStatus
/// Object log recovery device, obtained from CheckpointManager.
public IDevice objectLogRecoveryDevice;
+ /// The current head address; updated as pages are evicted during recovery.
+ public long headAddress;
+
/// Circular status buffer of 'capacity' size; the indexing wraps per hlog.GetPageIndexForPage().
public ReadStatus[] readStatus;
/// Circular status buffer of 'capacity' size; the indexing wraps per hlog.GetPageIndexForPage().
@@ -65,7 +68,7 @@ internal void WaitRead(int pageIndex)
while (readStatus[pageIndex] == ReadStatus.Pending)
readSemaphore.Wait();
if (readStatus[pageIndex] == ReadStatus.Error)
- throw new TsavoriteException($"Error reading page {pageIndex} from device");
+ ThrowTsavoriteException($"Error reading page {pageIndex} from device");
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -74,7 +77,7 @@ internal async ValueTask WaitReadAsync(int pageIndex, CancellationToken cancella
while (readStatus[pageIndex] == ReadStatus.Pending)
await readSemaphore.WaitAsync(cancellationToken).ConfigureAwait(false);
if (readStatus[pageIndex] == ReadStatus.Error)
- throw new TsavoriteException($"Error reading page {pageIndex} from device");
+ ThrowTsavoriteException($"Error reading page {pageIndex} from device");
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -117,15 +120,13 @@ internal void Dispose()
}
}
- internal struct RecoveryOptions
+ internal readonly struct RecoveryOptions
{
- internal long headAddress;
- internal long fuzzyRegionStartAddress;
- internal bool undoNextVersion;
+ internal readonly long fuzzyRegionStartAddress;
+ internal readonly bool undoNextVersion;
- internal RecoveryOptions(long headAddress, long fuzzyRegionStartAddress, bool undoNextVersion)
+ internal RecoveryOptions(long fuzzyRegionStartAddress, bool undoNextVersion)
{
- this.headAddress = headAddress;
this.fuzzyRegionStartAddress = fuzzyRegionStartAddress;
this.undoNextVersion = undoNextVersion;
}
@@ -162,8 +163,6 @@ public partial class TsavoriteKV : TsavoriteBase
where TStoreFunctions : IStoreFunctions
where TAllocator : IAllocator
{
- private const long NoPageFreed = -1;
-
///
/// GetLatestCheckpointTokens
///
@@ -267,11 +266,7 @@ public long GetIndexFileSize(Guid token)
return (long)(recoveredICInfo.info.num_ht_bytes + recoveredICInfo.info.num_ofb_bytes);
}
- private void GetClosestHybridLogCheckpointInfo(
- long requestedVersion,
- out Guid closestToken,
- out HybridLogCheckpointInfo closest,
- out byte[] cookie)
+ private void GetClosestHybridLogCheckpointInfo(long requestedVersion, out Guid closestToken, out HybridLogCheckpointInfo closest, out byte[] cookie)
{
HybridLogCheckpointInfo current;
var closestVersion = long.MaxValue;
@@ -289,11 +284,13 @@ private void GetClosestHybridLogCheckpointInfo(
current = new HybridLogCheckpointInfo();
current.Recover(hybridLogToken, checkpointManager, out var currCookie);
var distanceToTarget = (requestedVersion == -1 ? long.MaxValue : requestedVersion) - current.info.version;
+
// This is larger than intended version, cannot recover to this.
- if (distanceToTarget < 0) continue;
- // We have found the exact version to recover to --- the above conditional establishes that the
- // checkpointed version is <= requested version, and if next version is larger than requestedVersion,
- // there cannot be any closer version.
+ if (distanceToTarget < 0)
+ continue;
+
+ // We have found the exact version to recover to: the above conditional establishes that the checkpointed version is <= requested version,
+ // and if nextVersion is larger than requestedVersion, there cannot be any closer version.
if (current.info.nextVersion > requestedVersion)
{
closest = current;
@@ -421,67 +418,6 @@ public void Reset()
lastVersion = 0;
}
- /// Synchronous recovery driver
- private long InternalRecover(Guid indexToken, Guid hybridLogToken, int numPagesToPreload, bool undoNextVersion)
- {
- GetRecoveryInfo(indexToken, hybridLogToken, out var recoveredHLCInfo, out var recoveredICInfo);
- return InternalRecover(recoveredICInfo, recoveredHLCInfo, numPagesToPreload, undoNextVersion);
- }
-
- /// Synchronous recovery driver
- private long InternalRecover(IndexCheckpointInfo recoveredICInfo, HybridLogCheckpointInfo recoveredHLCInfo, int numPagesToPreload, bool undoNextVersion)
- {
- hlogBase.VerifyRecoveryInfo(recoveredHLCInfo, false);
-
- if (hlogBase.GetTailAddress() > hlogBase.GetFirstValidLogicalAddressOnPage(0))
- {
- logger?.LogInformation("Recovery called on non-empty log - resetting to empty state first. Make sure store is quiesced before calling Recover on a running store.");
- Reset();
- }
-
- if (!GetInitialRecoveryAddress(recoveredICInfo, recoveredHLCInfo, out long recoverFromAddress))
- RecoverFuzzyIndex(recoveredICInfo);
-
- if (!SetRecoveryPageRanges(recoveredHLCInfo, numPagesToPreload, recoverFromAddress, out long tailAddress, out long headAddress, out long scanFromAddress))
- return -1;
- RecoveryOptions options = new(headAddress, fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion);
-
- // Make index consistent for version v
- long readOnlyAddress, lastFreedPage;
- if (recoveredHLCInfo.info.useSnapshotFile == 0)
- {
- lastFreedPage = RecoverHybridLog(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress,
- recoveredHLCInfo.info.nextVersion, CheckpointType.FoldOver, options);
-
- readOnlyAddress = tailAddress;
- }
- else
- {
- if (recoveredHLCInfo.info.flushedLogicalAddress < headAddress)
- headAddress = recoveredHLCInfo.info.flushedLogicalAddress;
-
- // First recover from index starting point (fromAddress) to snapshot starting point (flushedLogicalAddress taken at PERSISTENCE_CALLBACK, so it includes
- // any flushes to the hybrid log files due to OnPagesMarkedReadOnly while we were flushing to the snapshot files).
- lastFreedPage = RecoverHybridLog(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.flushedLogicalAddress,
- recoveredHLCInfo.info.nextVersion, CheckpointType.Snapshot, options);
-
- // Then recover snapshot into mutable region. Note that the ObjectAllocator will not write object log records for the mutable region;
- // that only happens during flushes due to OnPagesMarkedReadOnly.
- var snapshotLastFreedPage = RecoverHybridLogFromSnapshotFile(scanFromAddress: recoveredHLCInfo.info.flushedLogicalAddress,
- recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress,
- snapshotStartAddress: recoveredHLCInfo.info.snapshotStartFlushedLogicalAddress, snapshotEndAddress: recoveredHLCInfo.info.snapshotFinalLogicalAddress,
- recoveredHLCInfo.info.nextVersion, recoveredHLCInfo.info.guid, options);
-
- if (snapshotLastFreedPage != NoPageFreed)
- lastFreedPage = snapshotLastFreedPage;
-
- readOnlyAddress = recoveredHLCInfo.info.flushedLogicalAddress;
- }
-
- DoPostRecovery(recoveredICInfo, recoveredHLCInfo, tailAddress, ref headAddress, ref readOnlyAddress, lastFreedPage);
- return recoveredHLCInfo.info.version;
- }
-
/// Aynchronous recovery driver
private ValueTask InternalRecoverAsync(Guid indexToken, Guid hybridLogToken, int numPagesToPreload, bool undoNextVersion, CancellationToken cancellationToken)
{
@@ -505,15 +441,20 @@ private async ValueTask InternalRecoverAsync(IndexCheckpointInfo recovered
if (!SetRecoveryPageRanges(recoveredHLCInfo, numPagesToPreload, recoverFromAddress, out long tailAddress, out long headAddress, out long scanFromAddress))
return -1;
- RecoveryOptions options = new(headAddress, fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion);
+ RecoveryOptions options = new(fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion);
// Make index consistent for version v
- long readOnlyAddress, lastFreedPage;
+ long readOnlyAddress;
+ long finalHeadAddress;
+ RecoveryStatus recoveryStatus;
if (recoveredHLCInfo.info.useSnapshotFile == 0)
{
- lastFreedPage = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress,
- recoveredHLCInfo.info.nextVersion, CheckpointType.FoldOver, options, cancellationToken).ConfigureAwait(false);
+ recoveryStatus = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress,
+ recoveredHLCInfo.info.nextVersion, CheckpointType.FoldOver, headAddress, options, cancellationToken).ConfigureAwait(false);
+ // FoldOver objects are already durable in the main object-log; set the tail to its end so subsequent writes append after it.
+ hlogBase.SetObjectLogTail(recoveredHLCInfo.info.hlogEndObjectLogTail);
+ finalHeadAddress = recoveryStatus.headAddress;
readOnlyAddress = tailAddress;
}
else
@@ -522,48 +463,40 @@ private async ValueTask InternalRecoverAsync(IndexCheckpointInfo recovered
headAddress = recoveredHLCInfo.info.flushedLogicalAddress;
// First recover from index starting point (fromAddress) to snapshot starting point (flushedLogicalAddress taken at PERSISTENCE_CALLBACK, so it includes
- // any flushes to the hybrid log files due to OnPagesMarkedReadOnly while we were flushing to the snapshot files).
- lastFreedPage = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.flushedLogicalAddress,
- recoveredHLCInfo.info.nextVersion, CheckpointType.Snapshot,
- new RecoveryOptions(headAddress, fuzzyRegionStartAddress: recoveredHLCInfo.info.startLogicalAddress, undoNextVersion), cancellationToken).ConfigureAwait(false);
-
- // Then recover snapshot into mutable region. Note that the ObjectAllocator will not write object log records for the mutable region;
- // that only happens during flushes due to OnPagesMarkedReadOnly.
- var snapshotLastFreedPage = await RecoverHybridLogFromSnapshotFileAsync(scanFromAddress: recoveredHLCInfo.info.flushedLogicalAddress,
+ // any flushes to the hybrid log files due to OnPagesMarkedReadOnly while we were flushing to the snapshot files). Object loading is deferred (see below).
+ recoveryStatus = await RecoverHybridLogAsync(scanFromAddress, recoverFromAddress, untilAddress: recoveredHLCInfo.info.flushedLogicalAddress,
+ recoveredHLCInfo.info.nextVersion, CheckpointType.Snapshot, headAddress, options, cancellationToken).ConfigureAwait(false);
+
+ // Initialize the main object-log tail to the end of the hybrid-log objects BEFORE recovering the snapshot pages: the snapshot-region flushes copy
+ // each record's objects from the snapshot object-log into the main object-log starting here, advancing the tail (via OnPartialFlushComplete) as they go.
+ // This must happen after the hybrid-log phase (which runs with the tail unset, like before) and before the snapshot phase.
+ hlogBase.SetObjectLogTail(recoveredHLCInfo.info.hlogEndObjectLogTail);
+
+ // Then recover snapshot into mutable region. The snapshot-region pages are read (without their objects), flushed to the main log with their objects
+ // copied into the main object-log (so they are durable and can be evicted into a smaller memory budget), and then objects are loaded once over the full
+ // recovered range (both the hybrid-log and snapshot regions), honoring the final headAddress.
+ finalHeadAddress = await RecoverHybridLogFromSnapshotFileAsync(scanFromAddress: recoveredHLCInfo.info.flushedLogicalAddress,
recoverFromAddress, untilAddress: recoveredHLCInfo.info.finalLogicalAddress,
snapshotStartAddress: recoveredHLCInfo.info.snapshotStartFlushedLogicalAddress, snapshotEndAddress: recoveredHLCInfo.info.snapshotFinalLogicalAddress,
- recoveredHLCInfo.info.nextVersion, recoveredHLCInfo.info.guid, options, cancellationToken).ConfigureAwait(false);
-
- if (snapshotLastFreedPage != NoPageFreed)
- lastFreedPage = snapshotLastFreedPage;
+ recoveredHLCInfo.info.nextVersion, recoveredHLCInfo.info.guid, headAddress: recoveryStatus.headAddress,
+ options, cancellationToken).ConfigureAwait(false);
readOnlyAddress = recoveredHLCInfo.info.flushedLogicalAddress;
}
- DoPostRecovery(recoveredICInfo, recoveredHLCInfo, tailAddress, ref headAddress, ref readOnlyAddress, lastFreedPage);
+ DoPostRecovery(recoveredICInfo, recoveredHLCInfo, tailAddress, finalHeadAddress, readOnlyAddress);
return recoveredHLCInfo.info.version;
}
- private void DoPostRecovery(IndexCheckpointInfo recoveredICInfo, HybridLogCheckpointInfo recoveredHLCInfo, long tailAddress, ref long headAddress, ref long readOnlyAddress, long lastFreedPage)
+ private void DoPostRecovery(IndexCheckpointInfo recoveredICInfo, HybridLogCheckpointInfo recoveredHLCInfo, long tailAddress, long headAddress, long readOnlyAddress)
{
- // Adjust head and read-only address post-recovery
- var _head = hlogBase.GetFirstValidLogicalAddressOnPage(1 + hlogBase.GetPage(tailAddress) - hlogBase.MaxAllocatedPageCount);
-
- // If additional pages have been freed to accommodate memory constraints, adjust head address accordingly
- if (lastFreedPage != NoPageFreed)
- {
- var nextAddress = hlogBase.GetFirstValidLogicalAddressOnPage(lastFreedPage + 1);
- if (_head < nextAddress)
- _head = nextAddress;
- }
-
- if (_head > headAddress)
- headAddress = _head;
+ // HeadAddress has already been adjusted for any evictions, but make sure we are not below any existing HeadAddress in the log.
+ if (headAddress < hlogBase.HeadAddress)
+ headAddress = hlogBase.HeadAddress;
if (readOnlyAddress < headAddress)
readOnlyAddress = headAddress;
hlogBase.RecoveryReset(tailAddress, headAddress, recoveredHLCInfo.info.beginAddress, readOnlyAddress);
- hlogBase.SetObjectLogTail(recoveredHLCInfo.info.hlogEndObjectLogTail);
checkpointManager.OnRecovery(recoveredICInfo.info.token, recoveredHLCInfo.info.guid);
recoveredHLCInfo.Dispose();
}
@@ -573,10 +506,7 @@ private void DoPostRecovery(IndexCheckpointInfo recoveredICInfo, HybridLogCheckp
/// Warning: use only when the system is not taking a checkpoint.
///
/// Version to set the store to
- public void SetVersion(long version)
- {
- stateMachineDriver.SetSystemState(SystemState.Make(Phase.REST, version));
- }
+ public void SetVersion(long version) => stateMachineDriver.SetSystemState(SystemState.Make(Phase.REST, version));
///
/// Compute recovery address and determine where to recover from
@@ -657,10 +587,10 @@ private bool SetRecoveryPageRanges(HybridLogCheckpointInfo recoveredHLCInfo, int
return true;
}
- private long ReadPagesWithMemoryConstraint(long endAddress, RecoveryStatus recoveryStatus, long page, long endPage, int numPagesToRead)
+ private void ReadPagesWithMemoryConstraint(long endAddress, RecoveryStatus recoveryStatus, long page, long endPage, int numPagesToRead)
{
- // Before reading in additional pages, trim memory if needed to make room for the inline space (we can't know the heap size yet)
- var freedPage = TrimLogMemorySize(recoveryStatus, tailPage: page, numPagesToRead);
+ // Before reading in additional pages, trim memory if needed to make room for the inline page space.
+ TrimLogPages(recoveryStatus, tailPage: page, numPagesToRead, untilAddress: endAddress);
// Set all page read statuses to Pending
for (var p = page; p < endPage; p++)
@@ -668,130 +598,93 @@ private long ReadPagesWithMemoryConstraint(long endAddress, RecoveryStatus recov
// Issue request to read pages as much as possible
hlogBase.AsyncReadPagesForRecovery(page, numPagesToRead, endAddress, recoveryStatus, recoveryStatus.recoveryDevicePageOffset,
- recoveryStatus.recoveryDevice, recoveryStatus.objectLogRecoveryDevice);
- return freedPage;
+ recoveryStatus.recoveryDevice, recoveryStatus.objectLogRecoveryDevice, RecoveryPhase.Pass1);
}
- ///
- /// Called before 'pagesToRead' number of pages are read into memory, this method determines how many previously allocated pages
- /// must be (partially or completely) freed to avoid the total memory size to go beyond the specified maximum during recovery.
- ///
- /// True if is nonzero, else false
- private bool GetEvictionPageRange(long tailPage, int numPagesToRead, CancellationToken cancellationToken, out long startPage, out int minEvictPageCount, out int maxEvictPageCount)
+ private void TrimLogPages(RecoveryStatus recoveryStatus, long tailPage, int numPagesToRead, long untilAddress)
{
- // The caller will iterate from startPage to endPage, so we use that as the basis for our eviction counts (which will start evicting at startPage).
- // tailPage is the leading page index and start/endPage are the trailing page indexes: startPage is at the start of a full buffer of pages,
- // and endPage is the start of the "usable" buffer capacity (the amount of pages we can actually use within the hlogBase.MaxAllocatedPageCount
- // constraint) PLUS the number of pages to read. If hlogBase.MaxAllocatedPageCount is less than hlogBase.BufferSize, the the calling
- // TrimLogMemorySize will probably be iterating over freed (non-allocated) pages from startPage to (endPage - numPagesToRead), and then
- // will start actually evicting pages. NOTE: Currently numPagesToRead is always 1, but we may be able to optimize that in the future.
- startPage = Math.Max(0, tailPage - hlogBase.BufferSize);
- var endPage = Math.Max(0, tailPage - hlogBase.MaxAllocatedPageCount + numPagesToRead);
-
- // TODO: Currently Recovery is still page-level eviction only. hlogBase.HeadAddress etc. are not yet set so we will have to propagate
- // the new headAddress back up the path we currently pass the lastFreedPage.
-
- // MinEvictPageCount is the number of pages we must clear so we can read numPagesToRead without violating the maximum page count constraint.
- minEvictPageCount = Math.Max(0, (int)(endPage - startPage));
- maxEvictPageCount = minEvictPageCount;
- if (endPage <= startPage)
- return false;
-
- // If no log size tracker, just ensure MaxPageCount is not exceeded.
if (hlogBase.logSizeTracker is null)
- return minEvictPageCount > 0;
+ return; // no size tracker: nothing is trimmed here
- // We have a log size tracker, so set minEvictPageCount to zero and maxEvictPageCount to the maximum number of pages we can evict;
- // the caller will also test logSizeTracker.IsBeyondSizeLimitToReadPages during the eviction loop and jump out if it drops within budget.
- maxEvictPageCount = Math.Max(minEvictPageCount, (int)(tailPage - startPage) - LogSizeTracker.MinResizeTargetPageCount);
- return minEvictPageCount > 0 || hlogBase.logSizeTracker.IsBeyondSizeLimitToReadPages(numPagesToRead);
- }
+ var headPage = hlogBase.GetPage(recoveryStatus.headAddress);
+ var loadedPages = tailPage - headPage + 1; // pages from the head page through tailPage, inclusive
+ var totalPagesNeeded = loadedPages + numPagesToRead;
- private long TrimLogMemorySize(RecoveryStatus recoveryStatus, long tailPage, int numPagesToRead)
- {
- var lastFreedPage = NoPageFreed;
- if (GetEvictionPageRange(tailPage, numPagesToRead, cancellationToken: default, out long startPage, out int minEvictPageCount, out int maxEvictPageCount))
+ // Respect the usual MinEvictionHeadAddressLag tail lag. Snapshot pages are made durable (objects copied to the main object-log) by
+ // RecoverSnapshotPages before they can be evicted here, so read-time eviction is free to evict any page to honor the memory budget.
+ var maxHeadAddress = untilAddress - LogSizeTracker.MinEvictionHeadAddressLag;
+
+ // Evict pages from headAddress upward while over budget, respecting MinEvictionHeadAddressLag. This is during Pass1,
+ // so there are no objects to evict; we're evicting a full page each iteration.
+ while (totalPagesNeeded > 1
+ && hlogBase.logSizeTracker.RemainingBudget < (long)numPagesToRead * hlogBase.PageSize // widen first: numPagesToRead is an int, so if PageSize is 32-bit too the product could overflow for a large batch
+ && recoveryStatus.headAddress < maxHeadAddress)
{
- // Evict pages one at a time
- for (var ii = 0; ii < maxEvictPageCount; ii++)
+ var pageIndex = hlogBase.GetPageIndexForPage(headPage);
+ if (hlogBase.IsAllocated(pageIndex))
{
- if (hlogBase.logSizeTracker is not null && ii >= minEvictPageCount && !hlogBase.logSizeTracker.IsBeyondSizeLimitToReadPages(numPagesToRead))
- break;
- var page = startPage + ii;
- var pageIndex = hlogBase.GetPageIndexForPage(page);
- if (hlogBase.IsAllocated(pageIndex))
- {
- recoveryStatus.WaitFlush(pageIndex);
- hlogBase.EvictPageForRecovery(page);
- lastFreedPage = page;
- }
+ recoveryStatus.WaitFlush(pageIndex);
+ hlogBase.EvictPageForRecovery(headPage);
}
+ headPage++;
+ recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(headPage);
+ if (recoveryStatus.headAddress > maxHeadAddress)
+ {
+ recoveryStatus.headAddress = maxHeadAddress; // NOTE(review): maxHeadAddress can fall on the page just evicted above (or in the next page's header) - confirm a headAddress inside an evicted page is tolerated downstream
+ break;
+ }
+ totalPagesNeeded--;
}
-
- return lastFreedPage;
}
- private async Task TrimLogMemorySizeAsync(RecoveryStatus recoveryStatus, long tailPage, int numPagesToRead, CancellationToken cancellationToken = default)
+ ///
+ /// After the recovery read loop and deferred object load, evict object-free resident pages from headAddress upward until AllocatedPageCount is within
+ /// MaxAllocatedPageCount, respecting MinEvictionHeadAddressLag.
+ /// The per-batch read-time trim (TrimLogPages) reserves room for each upcoming read against the delta-padded highTargetSize budget and does not run after
+ /// the final batch, so an object-free (inline) store can settle one page above the hard MaxAllocatedPageCount cap. Object-free pages are durable on the
+ /// main log (re-read on demand); the walk stops at the first page with live objects, whose budget is governed by the deferred object load's
+ /// heap-aware eviction. Dead pages below startPage (from store initialization) are freed up front in RecoverHybridLogAsync, so AllocatedPageCount here
+ /// reflects only resident data and this budget walk is accurate.
+ ///
+ private void TrimResidentPagesToBudget(RecoveryStatus recoveryStatus, long untilAddress)
{
- var lastFreedPage = NoPageFreed;
- if (GetEvictionPageRange(tailPage, numPagesToRead, cancellationToken: default, out long startPage, out int minEvictPageCount, out int maxEvictPageCount))
+ if (hlogBase.logSizeTracker is null)
+ return;
+
+ var maxHeadAddress = untilAddress - LogSizeTracker.MinEvictionHeadAddressLag;
+ while (hlogBase.AllocatedPageCount > hlogBase.MaxAllocatedPageCount && recoveryStatus.headAddress < maxHeadAddress)
{
- // Evict pages one at a time
- for (var ii = 0; ii < maxEvictPageCount; ii++)
+ var hp = hlogBase.GetPage(recoveryStatus.headAddress);
+ if (hlogBase.IsAllocated(hlogBase.GetPageIndexForPage(hp)))
{
- if (hlogBase.logSizeTracker is not null && ii >= minEvictPageCount && !hlogBase.logSizeTracker.IsBeyondSizeLimitToReadPages(numPagesToRead))
+ var objectIdMap = hlogBase._wrapper.GetPageObjectIdMap(hp);
+ if (objectIdMap is not null && objectIdMap.Count > 0) // page has a non-empty object-id map: stop the walk at this page
break;
- var page = startPage + ii;
- var pageIndex = hlogBase.GetPageIndexForPage(page);
- if (hlogBase.IsAllocated(pageIndex))
- {
- await recoveryStatus.WaitFlushAsync(pageIndex, cancellationToken).ConfigureAwait(false);
- hlogBase.EvictPageForRecovery(page);
- lastFreedPage = page;
- }
+ hlogBase.EvictPageForRecovery(hp); // no WaitFlush here; the visible callers run this after WaitUntilAllPagesHaveBeenFlushedAsync
}
+ recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(hp + 1); // NOTE(review): unlike TrimLogPages, the advanced head is not clamped to maxHeadAddress - confirm intended
}
-
- return lastFreedPage;
}
- private (long end, long freedPage) ReadPagesForRecovery(long untilAddress, RecoveryStatus recoveryStatus, long endPage, int numPagesToReadPerIteration, long page)
+ private async ValueTask<long> ReadPagesForRecoveryAsync(long untilAddress, RecoveryStatus recoveryStatus, long endPage, int numPagesToReadPerIteration, long page, CancellationToken cancellationToken)
{
var readEndPage = Math.Min(page + numPagesToReadPerIteration, endPage);
if (page < readEndPage)
{
var numPagesToRead = (int)(readEndPage - page);
- // Ensure that page slots that will be read into, have been flushed from previous reads. Due to the use of a single read semaphore,
- // this must be done in batches of "all flushes' followed by "all reads" to ensure proper sequencing of reads when
- // we are not using the full BufferSize (and thus the page-read index is not equal to the page-flush index).
- WaitUntilAllPagesHaveBeenFlushed(page, readEndPage, recoveryStatus);
- return (readEndPage, ReadPagesWithMemoryConstraint(untilAddress, recoveryStatus, page, readEndPage, numPagesToRead));
- }
-
- return (readEndPage, NoPageFreed);
- }
-
- private async ValueTask<(long end, long freedPage)> ReadPagesForRecoveryAsync(long untilAddress, RecoveryStatus recoveryStatus, long endPage, int numPagesToReadPerIteration, long page, CancellationToken cancellationToken)
- {
- var readEndPage = Math.Min(page + numPagesToReadPerIteration, endPage);
- if (page < readEndPage)
- {
- var numPagesToRead = (int)(readEndPage - page);
-
- // Ensure that page slots that will be read into, have been flushed from previous reads. Due to the use of a single read semaphore,
- // this must be done in batches of "all flushes' followed by "all reads" to ensure proper sequencing of reads when
+ // Ensure that page slots that will be read into have been flushed from previous reads. Due to the use of a single read semaphore,
+ // this must be done in batches of all flushes followed by all reads to ensure proper sequencing of reads when
// usableCapacity != capacity (and thus the page-read index is not equal to the page-flush index).
await WaitUntilAllPagesHaveBeenFlushedAsync(page, readEndPage, recoveryStatus, cancellationToken).ConfigureAwait(false);
- return (readEndPage, ReadPagesWithMemoryConstraint(untilAddress, recoveryStatus, page, readEndPage, numPagesToRead));
+ ReadPagesWithMemoryConstraint(untilAddress, recoveryStatus, page, readEndPage, numPagesToRead); // trims pages if needed, then issues the page reads for this batch
}
- return (readEndPage, NoPageFreed);
+ return readEndPage; // exclusive end page of this batch, clamped to endPage
}
///
- /// Synchronously recover the hybrid log from hybrid log files (not snapshot files). This also deserializes any objects or overflow and creates
- /// entries for them in the .
+ /// Asynchronously recover the hybrid log from hybrid log files (not snapshot files).
///
/// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page)
/// The address from which to perform recovery (undo v+1 records and append to tag-chain tail)
@@ -799,102 +692,52 @@ private async Task TrimLogMemorySizeAsync(RecoveryStatus recoveryStatus, l
/// The next version of the database at the time of checkpoint flush
/// The type of checkpoint
/// The recovery options
- /// The last freed page, if it was necessary to free any to limit heap memory
- private long RecoverHybridLog(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, CheckpointType checkpointType, RecoveryOptions options)
+ /// The cancellation token
+ private async ValueTask<RecoveryStatus> RecoverHybridLogAsync(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion,
+ CheckpointType checkpointType, long headAddress, RecoveryOptions options, CancellationToken cancellationToken)
{
- long lastFreedPage = NoPageFreed;
- if (untilAddress <= scanFromAddress)
- return lastFreedPage;
-
var recoveryStatus = GetPageRangesToRead(scanFromAddress, untilAddress, checkpointType, out long startPage, out long endPage, out int numPagesToReadPerIteration);
+ recoveryStatus.headAddress = headAddress; // seed eviction tracking with the caller-supplied head; TrimLogPages advances it as pages are evicted
- Debug.Assert(hlogBase.logSizeTracker is null || numPagesToReadPerIteration == 1, "numPagesToReadPerIteration must be 1 when tracking sizes");
- for (var page = startPage; page < endPage; page += numPagesToReadPerIteration)
+ // Free any pages still allocated below startPage before reading. The store is freshly constructed for recovery with the allocator's minimum
+ // pages allocated at page 0 (Head=Begin=Tail=0); when the checkpoint's BeginAddress is above page 0 those low pages lie below the first page we
+ // read (startPage), and the upward-only read/eviction never reaches them. Freeing them up front keeps them out of AllocatedPageCount for the
+ // whole budget-checked recovery, instead of carrying the dead pages through every budget check and reclaiming them at the end.
+ for (var deadPage = 0L; deadPage < startPage && deadPage < hlogBase.BufferSize; deadPage++)
{
- var (end, freedPage) = ReadPagesForRecovery(untilAddress, recoveryStatus, endPage, numPagesToReadPerIteration, page);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
-
- var trimPageReadCount = numPagesToReadPerIteration;
- for (var p = page; p < end; p++)
- {
- // Ensure page has been read into memory
- int pageIndex = hlogBase.GetPageIndexForPage(p);
- recoveryStatus.WaitRead(pageIndex);
-
- if (hlogBase.logSizeTracker is not null)
- {
- // Trim the log memory again in case we read large objects on the current page. Add 1 to tailPage so that
- // when the BufferSize subtraction wraps around the buffer it won't try to evict the page we just added.
- // Decrease trimPageReadCount as we process each page so we don't over-prune.
- freedPage = TrimLogMemorySize(recoveryStatus, tailPage: p + 1, trimPageReadCount--);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
- }
-
- // We make an extra pass to clear locks when reading every page back into memory
- ClearBitsOnPage(p, untilAddress, options);
- ProcessReadPageAndFlush(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, p, pageIndex);
- }
+ if (hlogBase.IsAllocated(hlogBase.GetPageIndexForPage(deadPage)))
+ hlogBase.EvictPageForRecovery(deadPage);
}
- WaitUntilAllPagesHaveBeenFlushed(startPage, endPage, recoveryStatus);
- return lastFreedPage;
- }
-
- ///
- /// Synchronously recover the hybrid log from hybrid log files (not snapshot files). This also deserializes any objects or overflow and creates
- /// entries for them in the .
- ///
- /// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page)
- /// The address from which to perform recovery (undo v+1 records and append to tag-chain tail)
- /// The last address to scan; this is initially the tailAddress at the time of checkpoint flush,
- /// The next version of the database at the time of checkpoint flush
- /// The type of checkpoint
- /// The recovery options
- /// The cancellation token
- /// The last freed page, if it was necessary to free any to limit heap memory
- private async ValueTask RecoverHybridLogAsync(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion,
- CheckpointType checkpointType, RecoveryOptions options, CancellationToken cancellationToken)
- {
- long lastFreedPage = NoPageFreed;
if (untilAddress <= scanFromAddress)
- return lastFreedPage;
+ return recoveryStatus; // nothing to scan; the returned status still carries the seeded headAddress
- var recoveryStatus = GetPageRangesToRead(scanFromAddress, untilAddress, checkpointType, out long startPage, out long endPage, out int numPagesToReadPerIteration);
-
- Debug.Assert(hlogBase.logSizeTracker is null || numPagesToReadPerIteration == 1, "numPagesToReadPerIteration must be 1 when tracking sizes");
for (long page = startPage; page < endPage; page += numPagesToReadPerIteration)
{
- var (end, freedPage) = await ReadPagesForRecoveryAsync(untilAddress, recoveryStatus, endPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
-
- var trimPageReadCount = numPagesToReadPerIteration;
+ var end = await ReadPagesForRecoveryAsync(untilAddress, recoveryStatus, endPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false);
for (var p = page; p < end; p++)
{
// Ensure page has been read into memory
var pageIndex = hlogBase.GetPageIndexForPage(p);
await recoveryStatus.WaitReadAsync(pageIndex, cancellationToken).ConfigureAwait(false);
- if (hlogBase.logSizeTracker is not null)
- {
- // Trim the log memory again in case we read large objects on the current page. Add 1 to tailPage so that
- // when the BufferSize subtraction wraps around the buffer it won't try to evict the page we just added.
- // Decrease trimPageReadCount as we process each page so we don't over-prune.
- freedPage = await TrimLogMemorySizeAsync(recoveryStatus, tailPage: p + 1, trimPageReadCount--, cancellationToken).ConfigureAwait(false);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
- }
-
// We make an extra pass to clear locks when reading every page back into memory
- ClearBitsOnPage(p, untilAddress, options);
- ProcessReadPageAndFlush(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, p, pageIndex);
+ ClearBitsOnPage(p, untilAddress, in options, recoveryStatus.headAddress);
+ ProcessReadPageAndFlush(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, in options, recoveryStatus, p, pageIndex);
}
}
await WaitUntilAllPagesHaveBeenFlushedAsync(startPage, endPage, recoveryStatus, cancellationToken).ConfigureAwait(false);
- return lastFreedPage;
+
+ // Defer object loading when this is the hybrid-log phase of a snapshot recovery; RecoverHybridLogFromSnapshotFileAsync
+ // loads the objects once after the snapshot pages have also been read (without their objects), so the final headAddress
+ // (after eviction over the full recovered range) is honored. For FoldOver there is no following snapshot phase.
+ if (checkpointType != CheckpointType.Snapshot)
+ {
+ RecoveryLoadObjectsPass2(recoveryStatus, recoveryStatus.headAddress, untilAddress, objectLogDevice: null);
+ TrimResidentPagesToBudget(recoveryStatus, untilAddress); // runs after the flush wait above, so it evicts without waiting on flushes itself
+ }
+ return recoveryStatus; // not disposed here; NOTE(review): presumably the caller reads headAddress and disposes it - confirm
}
///
@@ -915,9 +758,16 @@ private RecoveryStatus GetPageRangesToRead(long scanFromAddress, long untilAddre
if (untilAddress > hlogBase.GetFirstValidLogicalAddressOnPage(endPage) && untilAddress > scanFromAddress)
endPage++;
- // If heap memory is to be tracked, then read one page at a time to control memory usage
- var totalPagesToRead = (int)(endPage - startPage);
- numPagesToReadPerIteration = hlogBase.logSizeTracker is null ? Math.Min(hlogBase.BufferSize, totalPagesToRead) : 1;
+ // Read as many pages as buffer allows, leaving room for at least 1 page for eviction.
+ numPagesToReadPerIteration = Math.Min(hlogBase.BufferSize - 1, (int)(endPage - startPage));
+
+ // Never read more pages per batch than the memory budget allows. BufferSize can exceed MaxAllocatedPageCount when the budget is not a
+ // power-of-two page count (e.g. a 23k budget => MaxAllocatedPageCount 5, BufferSize 8); reading a full BufferSize-1 batch would fill the
+ // circular buffer above MaxAllocatedPageCount, leaving over-budget pages resident that read-time eviction (TrimLogPages) cannot reclaim
+ // because they were read below the eviction floor (untilAddress - MinEvictionHeadAddressLag). MaxAllocatedPageCount is the allocator's hard
+ // cap on AllocatedPageCount, so honoring it here keeps recovery within budget at every step (modulo the MinEvictionHeadAddressLag tail).
+ if (hlogBase.logSizeTracker is not null && hlogBase.MaxAllocatedPageCount < numPagesToReadPerIteration)
+ numPagesToReadPerIteration = hlogBase.MaxAllocatedPageCount;
return new RecoveryStatus(hlogBase.BufferSize);
}
@@ -932,7 +782,7 @@ private RecoveryStatus GetPageRangesToRead(long scanFromAddress, long untilAddre
/// The instance
/// The page number to process
/// The index of in the allocator's circular page buffer
- private void ProcessReadPageAndFlush(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options,
+ private void ProcessReadPageAndFlush(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options,
RecoveryStatus recoveryStatus, long page, int pageIndex)
{
if (ProcessReadPage(recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, page, pageIndex))
@@ -957,7 +807,7 @@ private void ProcessReadPageAndFlush(long scanFromAddress, long recoverFromAddre
/// The page number to process
/// The index of in the allocator's circular page buffer
///
- private bool ProcessReadPage(long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options, RecoveryStatus recoveryStatus,
+ private bool ProcessReadPage(long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options, RecoveryStatus recoveryStatus,
long page, int pageIndex)
{
var startLogicalAddressOfPage = hlogBase.GetLogicalAddressOfStartOfPage(page); // Do not offset for page header; that's done below and in RecoverFromPage
@@ -989,12 +839,6 @@ private bool ProcessReadPage(long recoverFromAddress, long untilAddress, long ne
return false;
}
- private void WaitUntilAllPagesHaveBeenFlushed(long startPage, long endPage, RecoveryStatus recoveryStatus)
- {
- for (long page = startPage; page < endPage; page++)
- recoveryStatus.WaitFlush(hlogBase.GetPageIndexForPage(page));
- }
-
private async ValueTask WaitUntilAllPagesHaveBeenFlushedAsync(long startPage, long endPage, RecoveryStatus recoveryStatus, CancellationToken cancellationToken)
{
for (long page = startPage; page < endPage; page++)
@@ -1002,7 +846,7 @@ private async ValueTask WaitUntilAllPagesHaveBeenFlushedAsync(long startPage, lo
}
///
- /// Synchronously recover the hybrid log from snapshot files
+ /// Asynchronously recover the hybrid log from snapshot files.
///
/// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page)
/// The address from which to perform recovery (undo v+1 records and append to tag-chain tail)
@@ -1011,24 +855,33 @@ private async ValueTask WaitUntilAllPagesHaveBeenFlushedAsync(long startPage, lo
/// The end of the snapshot; the tailAddress at the start of the WAIT_FLUSH phase
/// The next version of the database at the time of checkpoint flush
/// The checkpoint token guid
+ /// The headAddress resulting from the preceding hybrid-log recovery phase (the lowest resident address); seeds eviction tracking here
/// The recovery options
- /// The last freed page, if it was necessary to free any to limit heap memory
- private long RecoverHybridLogFromSnapshotFile(long scanFromAddress, long recoverFromAddress, long untilAddress,
- long snapshotStartAddress, long snapshotEndAddress, long nextVersion, Guid guid, RecoveryOptions options)
+ /// The final headAddress (lowest resident address) after reading the snapshot pages and loading objects
+ private async ValueTask RecoverHybridLogFromSnapshotFileAsync(long scanFromAddress, long recoverFromAddress, long untilAddress,
+ long snapshotStartAddress, long snapshotEndAddress, long nextVersion, Guid guid, long headAddress, RecoveryOptions options,
+ CancellationToken cancellationToken)
{
- long lastFreedPage = NoPageFreed;
GetSnapshotPageRangesToRead(scanFromAddress, untilAddress, snapshotStartAddress, snapshotEndAddress, guid, out long startPage,
out long endPage, out long snapshotEndPage, out var recoveryStatus, out int numPagesToReadPerIteration);
+ // Seed the head from the preceding hybrid-log phase so the snapshot-read loop (TrimLogPages) and the deferred object load
+ // evict from, and track, the correct lowest-resident address across the full recovered range.
+ recoveryStatus.headAddress = headAddress;
+
+ // The snapshot region is the boundary page (the page containing scanFromAddress) and every page above it; pages strictly below it are the
+ // hybrid-log region. RecoverSnapshotPages flushes every snapshot page to the main log AND copies its objects into the main object-log, so
+ // snapshot pages are fully durable and may be evicted during recovery (read-time via TrimLogPages or load-time below) — required to recover
+ // into a smaller memory budget than was checkpointed. The boundary is used below to choose the object-log device for deferred deserialization.
+ var boundaryPageStart = hlogBase.GetLogicalAddressOfStartOfPage(hlogBase.GetPage(scanFromAddress));
+
// Notify application of checkpoint token before processing snapshot records
if (storeFunctions.CallOnDiskRead)
storeFunctions.OnRecovery(guid);
for (long page = startPage; page < endPage; page += numPagesToReadPerIteration)
{
- var (_, freedPage) = ReadPagesForRecovery(snapshotEndAddress, recoveryStatus, snapshotEndPage, numPagesToReadPerIteration, page);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
+ _ = await ReadPagesForRecoveryAsync(snapshotEndAddress, recoveryStatus, snapshotEndPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false);
var end = Math.Min(page + numPagesToReadPerIteration, endPage);
for (long p = page; p < end; p++)
@@ -1037,22 +890,14 @@ private long RecoverHybridLogFromSnapshotFile(long scanFromAddress, long recover
if (p < snapshotEndPage)
{
// Ensure the page is read from file
- recoveryStatus.WaitRead(pageIndex);
-
- if (hlogBase.logSizeTracker is not null)
- {
- // Trim the log memory again in case we read large objects on the current page. Use 0 for numPagesToRead so we don't over-prune.
- freedPage = TrimLogMemorySize(recoveryStatus, tailPage: p + 1, 0);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
- }
+ await recoveryStatus.WaitReadAsync(pageIndex, cancellationToken).ConfigureAwait(false);
// We make an extra pass to clear locks when reading pages back into memory
- ClearBitsOnPage(p, untilAddress, options, snapshotFromAddress: scanFromAddress);
+ ClearBitsOnPage(p, untilAddress, in options, recoveryStatus.headAddress, snapshotFromAddress: scanFromAddress);
}
else
{
- recoveryStatus.WaitFlush(pageIndex);
+ await recoveryStatus.WaitFlushAsync(pageIndex, cancellationToken).ConfigureAwait(false);
if (!hlogBase.IsAllocated(pageIndex))
hlog.AllocatePage(pageIndex);
else
@@ -1060,88 +905,148 @@ private long RecoverHybridLogFromSnapshotFile(long scanFromAddress, long recover
}
}
- RecoverSnapshotPages(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options,
+ RecoverSnapshotPages(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, in options,
endPage, snapshotEndPage, numPagesToReadPerIteration, recoveryStatus, page, end);
}
- WaitUntilAllPagesHaveBeenFlushed(startPage, endPage, recoveryStatus);
+ await WaitUntilAllPagesHaveBeenFlushedAsync(startPage, endPage, recoveryStatus, cancellationToken).ConfigureAwait(false);
+
+ // Deferred object load over the full recovered range, honoring the final headAddress. Phase 2 read the snapshot pages as full
+ // pages, so the page containing scanFromAddress (the boundary page, boundaryPageStart) and every page above it were read from the
+ // snapshot device and their live records reference the snapshot object-log; pages strictly below the boundary page were read by the
+ // hybrid-log phase from the main object-log. The device boundary is therefore page-aligned at boundaryPageStart (computed above).
+
+ // Snapshot region (boundary page and above): deserialize resident pages' objects from the snapshot object-log device (the live records
+ // still carry their snapshot positions). These pages are now durable on the main log/object-log (RecoverSnapshotPages copied their objects),
+ // so evict pages as needed to honor the memory budget; an evicted record is simply read back from the main log/object-log on demand.
+ RecoveryLoadObjectsPass2(recoveryStatus, Math.Max(recoveryStatus.headAddress, boundaryPageStart), untilAddress, recoveryStatus.objectLogRecoveryDevice);
+
+ // Hybrid-log region (below the boundary page): read objects from the main object-log device, evicting pages as needed to honor the
+ // memory budget. These pages are durable on the main log/object-log, so an evicted record is simply read back from disk on demand.
+ if (recoveryStatus.headAddress < boundaryPageStart)
+ RecoveryLoadObjectsPass2(recoveryStatus, recoveryStatus.headAddress, boundaryPageStart, objectLogDevice: null);
+
+ // Bring AllocatedPageCount within the hard MaxAllocatedPageCount cap for object-free pages (see TrimResidentPagesToBudget): the per-batch
+ // read-time trim targets the delta-padded highTargetSize and does not run after the final batch, so an inline store can settle one page over.
+ TrimResidentPagesToBudget(recoveryStatus, untilAddress);
+
+ var finalHeadAddress = recoveryStatus.headAddress;
recoveryStatus.Dispose();
- return lastFreedPage;
+ return finalHeadAddress;
}
///
- /// Asynchronously recover the hybrid log from snapshot files
+ /// Load (deserialize) objects for the recovered pages in the address range [fromAddress, untilAddress),
+ /// reading the object log from objectLogDevice (null = the main object-log device). The page range is derived from the
+ /// addresses.
///
- /// The address to start scanning from; the lowest address at which we will bring pages into the circular buffer (may be in the middle of a page)
- /// The address from which to perform recovery (undo v+1 records and append to tag-chain tail)
- /// The last address to scan; this is initially the tailAddress at the time of checkpoint flush,
- /// The start of the mutable region; the FlushedUntilAddress at the start of the WAIT_FLUSH phase
- /// The end of the snapshot; the tailAddress at the start of the WAIT_FLUSH phase
- /// The next version of the database at the time of checkpoint flush
- /// The checkpoint token guid
- /// The recovery options
- /// The last freed page, if it was necessary to free any to limit heap memory
- private async ValueTask RecoverHybridLogFromSnapshotFileAsync(long scanFromAddress, long recoverFromAddress, long untilAddress,
- long snapshotStartAddress, long snapshotEndAddress, long nextVersion, Guid guid, RecoveryOptions options,
- CancellationToken cancellationToken)
+ /// <param name="recoveryStatus">The <see cref="RecoveryStatus"/> instance; its headAddress is the eviction floor and is advanced as pages are evicted</param>
+ /// <param name="fromAddress">The lowest address whose objects are to be loaded (the load floor; pages below it are not loaded by this call)</param>
+ /// <param name="untilAddress">The end of the range whose objects are to be loaded</param>
+ /// <param name="objectLogDevice">The object-log device to read from; null means the main object-log device</param>
+ private void RecoveryLoadObjectsPass2(RecoveryStatus recoveryStatus, long fromAddress, long untilAddress, IDevice objectLogDevice)
{
- long lastFreedPage = NoPageFreed;
- GetSnapshotPageRangesToRead(scanFromAddress, untilAddress, snapshotStartAddress, snapshotEndAddress, guid, out long startPage,
- out long endPage, out long snapshotEndPage, out var recoveryStatus, out int numPagesToReadPerIteration);
+ if (fromAddress >= untilAddress)
+ return;
- // Notify application of checkpoint token before processing snapshot records
- if (storeFunctions.CallOnDiskRead)
- storeFunctions.OnRecovery(guid);
+ var startPage = hlogBase.GetPage(fromAddress);
+ var endPage = hlogBase.GetPage(untilAddress);
+ if (untilAddress > hlogBase.GetFirstValidLogicalAddressOnPage(endPage))
+ endPage++;
- for (long page = startPage; page < endPage; page += numPagesToReadPerIteration)
+ // Load all objects from fromAddress to untilAddress with no eviction when there is no size tracker.
+ if (hlogBase.logSizeTracker is null)
{
- var (_, freedPage) = await ReadPagesForRecoveryAsync(snapshotEndAddress, recoveryStatus, snapshotEndPage, numPagesToReadPerIteration, page, cancellationToken).ConfigureAwait(false);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
- var end = Math.Min(page + numPagesToReadPerIteration, endPage);
+ for (var page = startPage; page < endPage; page++)
+ {
+ var pageIndex = hlogBase.GetPageIndexForPage(page);
+ if (!hlogBase.IsAllocated(pageIndex))
+ continue;
- for (long p = page; p < end; p++)
+ var pageFromAddress = page == startPage ? fromAddress : hlogBase.GetFirstValidLogicalAddressOnPage(page);
+ var pageUntilAddress = page == endPage - 1 ? untilAddress : hlogBase.GetLogicalAddressOfStartOfPage(page + 1);
+ hlogBase.LoadObjectsForRecoveryPass2(page, pageFromAddress, pageUntilAddress, objectLogDevice);
+ }
+ return;
+ }
+
+ // With a size tracker, iterate pages from highest (untilAddress) to lowest (fromAddress) with budget control, evicting pages (and moving headAddress up) as needed.
+ var maxHeadAddress = untilAddress - LogSizeTracker.MinEvictionHeadAddressLag;
+
+ for (var page = endPage - 1; page >= startPage; page--)
+ {
+ var pageIndex = hlogBase.GetPageIndexForPage(page);
+ if (!hlogBase.IsAllocated(pageIndex))
+ continue;
+
+ var pageFromAddress = Math.Max(fromAddress, hlogBase.GetFirstValidLogicalAddressOnPage(page));
+ var pageUntilAddress = page == endPage - 1 ? untilAddress : hlogBase.GetLogicalAddressOfStartOfPage(page + 1);
+ if (pageFromAddress >= pageUntilAddress)
+ continue;
+
+ // Enforce MinEvictionHeadAddressLag: if the load floor is above maxHeadAddress (untilAddress - MinEvictionHeadAddressLag), lower it to maxHeadAddress
+ if (pageFromAddress > maxHeadAddress)
+ pageFromAddress = maxHeadAddress;
+
+ var totalPageObjectSize = hlogBase.CalculatePageObjectSizes(page, pageFromAddress, pageUntilAddress);
+ if (totalPageObjectSize == 0)
{
- int pageIndex = hlogBase.GetPageIndexForPage(p);
- if (p < snapshotEndPage)
- {
- // Ensure the page is read from file
- await recoveryStatus.WaitReadAsync(pageIndex, cancellationToken).ConfigureAwait(false);
+ hlogBase.LoadObjectsForRecoveryPass2(page, pageFromAddress, pageUntilAddress, objectLogDevice);
+ continue;
+ }
- if (hlogBase.logSizeTracker is not null)
- {
- // Trim the log memory again in case we read large objects on the current page. Use 0 for numPagesToRead so we don't over-prune.
- freedPage = await TrimLogMemorySizeAsync(recoveryStatus, tailPage: p + 1, numPagesToRead: 0, cancellationToken).ConfigureAwait(false);
- if (freedPage != NoPageFreed)
- lastFreedPage = freedPage;
- }
+ var remainingBudget = hlogBase.logSizeTracker.RemainingBudget;
+ var pageCutoff = hlogBase.FindHeadAddressCutoffOnPage(page, pageUntilAddress, totalPageObjectSize, (int)(page - hlogBase.GetPage(recoveryStatus.headAddress)), remainingBudget, out var numPagesBelowToEvict);
- // We make an extra pass to clear locks when reading pages back into memory
- ClearBitsOnPage(p, untilAddress, options, snapshotFromAddress: scanFromAddress);
- }
- else
+ // Evict pages below if needed
+ var currentHeadPage = hlogBase.GetPage(recoveryStatus.headAddress);
+ while (numPagesBelowToEvict > 0 && currentHeadPage < page)
+ {
+ var headPageIndex = hlogBase.GetPageIndexForPage(currentHeadPage);
+ if (hlogBase.IsAllocated(headPageIndex))
+ hlogBase.EvictPageForRecovery(currentHeadPage);
+
+ currentHeadPage++;
+ recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(currentHeadPage);
+ numPagesBelowToEvict--;
+ }
+
+ // Load objects, using per-record budget checking via DeserializeObjectsOnPage.
+ // The method handles all records from pageCutoff to pageUntilAddress.
+ hlogBase.LoadObjectsForRecoveryPass2(page, pageCutoff, pageUntilAddress, objectLogDevice);
+
+ // After loading, recheck the budget. If still over budget, evict whole pages from headAddress upward, stopping before the page just loaded.
+ if (hlogBase.logSizeTracker.IsOverBudget && recoveryStatus.headAddress < maxHeadAddress)
+ {
+ // Evict from headAddress upward until under budget or at the lag limit
+ while (hlogBase.logSizeTracker.IsOverBudget && recoveryStatus.headAddress < maxHeadAddress)
{
- await recoveryStatus.WaitFlushAsync(pageIndex, cancellationToken).ConfigureAwait(false);
- if (!hlogBase.IsAllocated(pageIndex))
- hlog.AllocatePage(pageIndex);
- else
- hlogBase.ClearPage(pageIndex);
+ currentHeadPage = hlogBase.GetPage(recoveryStatus.headAddress);
+ if (currentHeadPage >= page)
+ break;
+
+ var headPageIndex = hlogBase.GetPageIndexForPage(currentHeadPage);
+ if (hlogBase.IsAllocated(headPageIndex))
+ hlogBase.EvictPageForRecovery(currentHeadPage);
+
+ recoveryStatus.headAddress = hlogBase.GetFirstValidLogicalAddressOnPage(currentHeadPage + 1);
}
}
- RecoverSnapshotPages(scanFromAddress, recoverFromAddress, untilAddress, nextVersion, options,
- endPage, snapshotEndPage, numPagesToReadPerIteration, recoveryStatus, page, end);
- }
+ // Update headAddress from cutoff if it was raised
+ if (pageCutoff > recoveryStatus.headAddress)
+ recoveryStatus.headAddress = pageCutoff;
- await WaitUntilAllPagesHaveBeenFlushedAsync(startPage, endPage, recoveryStatus, cancellationToken).ConfigureAwait(false);
- recoveryStatus.Dispose();
- return lastFreedPage;
+ // If headAddress is on or above the current page, we're done
+ if (recoveryStatus.headAddress >= hlogBase.GetFirstValidLogicalAddressOnPage(page))
+ break;
+ }
}
///
/// For each page in the snapshot from [page, end), process the page for recovery.
///
- private void RecoverSnapshotPages(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options,
+ private void RecoverSnapshotPages(long scanFromAddress, long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options,
long endPage, long snapshotEndPage, int numPagesToRead, RecoveryStatus recoveryStatus, long page, long end)
{
for (long p = page; p < end; p++)
@@ -1152,10 +1057,24 @@ private void RecoverSnapshotPages(long scanFromAddress, long recoverFromAddress,
if (recoverFromAddress < endLogicalAddress && recoverFromAddress < untilAddress)
ProcessReadSnapshotPage(recoverFromAddress, untilAddress, nextVersion, options, recoveryStatus, p, pageIndex);
- // Issue next read
- if (p + numPagesToRead < endPage)
+ if (hlogBase.IsObjectAllocator && hlogBase.logSizeTracker is not null)
+ {
+ // Object store under a memory budget (a size tracker is attached, so pages may be evicted during recovery — both read-time via
+ // TrimLogPages and load-time during the deferred object load). Flush every snapshot page to the main log, copying its objects from the
+ // snapshot object-log into the main object-log so the page is fully durable before it can be evicted, letting us recover into a smaller
+ // memory budget than was checkpointed. (Without a size tracker no eviction occurs, so we avoid these writes — which also keeps configs
+ // whose page size exceeds the main-log device segment, that never flush to the main log, working as before.) The objectLogRecoveryDevice
+ // is the snapshot object-log (copy source); the boundary page's flush starts at scanFromAddress, so only its snapshot-region records are
+ // processed.
+ recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending;
+ hlogBase.AsyncFlushPagesForRecovery(scanFromAddress, p, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus,
+ recoveryStatus.objectLogRecoveryDevice, formerFlushedUntilAddress: scanFromAddress);
+ }
+ else if (!hlogBase.IsObjectAllocator && p + numPagesToRead < endPage)
{
- // Flush snapshot page to main log
+ // String store: records are fully inline, so a snapshot page is durable once written to the main log (no object copy needed) and the
+ // deferred object load is a no-op (so it never evicts). Flush only pages that will be pushed out of the buffer by subsequent reads, so
+ // read-time eviction can reclaim them; the final resident set (the last batch) stays in memory, as before.
recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending;
hlogBase.AsyncFlushPagesForRecovery(scanFromAddress, p, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus);
}
@@ -1201,13 +1120,17 @@ private void GetSnapshotPageRangesToRead(long scanFromAddress, long untilAddress
recoveryDevicePageOffset = snapshotStartPage
};
- // Initially issue read request for all pages that can be held in memory
- // If heap memory is to be tracked, then read one page at a time to control memory usage
- var totalPagesToRead = (int)(snapshotEndPage - startPage);
- numPagesToReadPerIteration = hlogBase.logSizeTracker is null ? Math.Min(hlogBase.BufferSize, totalPagesToRead) : 1;
+ // Read as many pages as buffer allows, leaving room for at least 1 page for eviction.
+ numPagesToReadPerIteration = Math.Min(hlogBase.BufferSize - 1, (int)(endPage - startPage));
+
+ // Never read more pages per batch than the memory budget allows (see GetPageRangesToRead for the full rationale): BufferSize can exceed
+ // MaxAllocatedPageCount when the budget is not a power-of-two page count, and a full BufferSize-1 batch would fill the circular buffer above
+ // MaxAllocatedPageCount with pages read below the eviction floor that TrimLogPages cannot reclaim.
+ if (hlogBase.logSizeTracker is not null && hlogBase.MaxAllocatedPageCount < numPagesToReadPerIteration)
+ numPagesToReadPerIteration = hlogBase.MaxAllocatedPageCount;
}
- private void ProcessReadSnapshotPage(long recoverFromAddress, long untilAddress, long nextVersion, RecoveryOptions options, RecoveryStatus recoveryStatus, long page, int pageIndex)
+ private void ProcessReadSnapshotPage(long recoverFromAddress, long untilAddress, long nextVersion, in RecoveryOptions options, RecoveryStatus recoveryStatus, long page, int pageIndex)
{
// Page at hand
var startLogicalAddressOfPage = hlogBase.GetLogicalAddressOfStartOfPage(page); // Do not offset for page header; that's done below and in RecoverFromPage
@@ -1244,14 +1167,14 @@ private void ProcessReadSnapshotPage(long recoverFromAddress, long untilAddress,
/// Recovery options (headAddress determines if page is in-memory)
/// If > 0, records at or above this address will get OnRecoverySnapshotRead.
/// Records below this address are main-log records that happened to share the boundary page with the snapshot.
- private void ClearBitsOnPage(long page, long untilAddress, RecoveryOptions options, long snapshotFromAddress = 0)
+ private void ClearBitsOnPage(long page, long untilAddress, in RecoveryOptions options, long headAddress, long snapshotFromAddress = 0)
{
var startLogicalAddress = hlogBase.GetLogicalAddressOfStartOfPage(page);
var endLogicalAddress = hlogBase.GetLogicalAddressOfStartOfPage(page + 1);
var physicalAddress = hlogBase.GetPhysicalAddress(startLogicalAddress);
// no need to clear locks for records that will not end up in main memory
- if (options.headAddress >= endLogicalAddress)
+ if (headAddress >= endLogicalAddress)
return;
var pageSize = hlogBase.GetPageSize();
@@ -1266,9 +1189,8 @@ private void ClearBitsOnPage(long page, long untilAddress, RecoveryOptions optio
{
var recordLogicalAddress = startLogicalAddress + recordOffset;
- // On the snapshot path, skip records below snapshotFromAddress —
- // they are main-log records on the boundary page that were already
- // processed (with OnDiskRead) in the main-log recovery pass.
+ // On the snapshot path, skip records below snapshotFromAddress; they are main-log records on the boundary page
+ // that were already processed (with OnDiskRead) in the main-log recovery pass.
if (snapshotFromAddress == 0 || recordLogicalAddress >= snapshotFromAddress)
{
storeFunctions.OnDiskRead(ref logRecord);
@@ -1303,7 +1225,7 @@ private void ClearBitsOnPage(long page, long untilAddress, RecoveryOptions optio
/// Recovery options
/// True if we touched the page (and thus it needs to be flushed), else false
private unsafe bool RecoverFromPage(long recoverFromAddress, long pageFromAddressOffset, long pageUntilAddressOffset,
- long pageStartLogicalAddress, long pageStartPhysicalAddress, RecoveryOptions options)
+ long pageStartLogicalAddress, long pageStartPhysicalAddress, in RecoveryOptions options)
{
Debug.Assert(pageFromAddressOffset >= hlogBase.pageHeaderSize, $"fromLogicalAddressInPage {pageFromAddressOffset} must be >= hlogBase.pageHeaderSize {hlogBase.pageHeaderSize} (which may be 0)");
Debug.Assert(pageUntilAddressOffset <= hlogBase.GetPageSize(), $"pageSize {pageUntilAddressOffset} must be <= PageSize {hlogBase.GetPageSize()}");
@@ -1374,7 +1296,7 @@ public abstract partial class AllocatorBase : IDisp
where TAllocator : IAllocator
{
///
- /// Restore log; called from TsavoriteLog
+ /// Restore log; called from TsavoriteLog. TODO: This sync version is invoked via BumpCurrentEpoch, which doesn't have async support.
///
///
///
@@ -1454,7 +1376,7 @@ private bool RestoreHybridLogInitializePages(long beginAddress, long headAddress
}
// Passing no objectLogDevice means we'll use the one in the allocator
- AsyncReadPagesForRecovery(headPage, numPages, untilAddress, recoveryStatus);
+ AsyncReadPagesForRecovery(headPage, numPages, untilAddress, recoveryStatus, recoveryPhase: RecoveryPhase.None);
return true;
}
diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs b/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs
index e2518faa097..b7bd70cd0c7 100644
--- a/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Index/Tsavorite/Tsavorite.cs
@@ -210,7 +210,7 @@ public TsavoriteKV(KVSettings kvSettings, TStoreFunctions storeFunctions, Func
- /// Recover from the latest valid checkpoint (blocking operation)
- ///
- /// Number of pages to preload into memory (beyond what needs to be read for recovery)
- /// Whether records with versions beyond checkpoint version need to be undone (and invalidated on log)
- /// Version we actually recovered to
- public long Recover(int numPagesToPreload = -1, bool undoNextVersion = true)
- {
- FindRecoveryInfo(-1, out var recoveredHlcInfo, out var recoveredIcInfo);
- return InternalRecover(recoveredIcInfo, recoveredHlcInfo, numPagesToPreload, undoNextVersion);
- }
-
///
/// Get the version we would recover to if we were to request recovery the specified version
///
@@ -409,16 +397,6 @@ public ValueTask RecoverAsync(int numPagesToPreload = -1, bool undoNextVer
return InternalRecoverAsync(recoveredIcInfo, recoveredHlcInfo, numPagesToPreload, undoNextVersion, cancellationToken);
}
- ///
- /// Recover from specific token (blocking operation)
- ///
- /// Token
- /// Number of pages to preload into memory after recovery
- /// Whether records with versions beyond checkpoint version need to be undone (and invalidated on log)
- /// Version we actually recovered to
- public long Recover(Guid fullCheckpointToken, int numPagesToPreload = -1, bool undoNextVersion = true)
- => InternalRecover(fullCheckpointToken, fullCheckpointToken, numPagesToPreload, undoNextVersion);
-
///
/// Asynchronously recover from specific token (blocking operation)
///
@@ -430,17 +408,6 @@ public long Recover(Guid fullCheckpointToken, int numPagesToPreload = -1, bool u
public ValueTask RecoverAsync(Guid fullCheckpointToken, int numPagesToPreload = -1, bool undoNextVersion = true, CancellationToken cancellationToken = default)
=> InternalRecoverAsync(fullCheckpointToken, fullCheckpointToken, numPagesToPreload, undoNextVersion, cancellationToken);
- ///
- /// Recover from specific index and log token (blocking operation)
- ///
- ///
- ///
- /// Number of pages to preload into memory after recovery
- /// Whether records with versions beyond checkpoint version need to be undone (and invalidated on log)
- /// Version we actually recovered to
- public long Recover(Guid indexCheckpointToken, Guid hybridLogCheckpointToken, int numPagesToPreload = -1, bool undoNextVersion = true)
- => InternalRecover(indexCheckpointToken, hybridLogCheckpointToken, numPagesToPreload, undoNextVersion);
-
///
/// Asynchronously recover from specific index and log token (blocking operation)
///
diff --git a/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs b/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs
index c368a93bd16..40d528911f3 100644
--- a/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs
+++ b/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs
@@ -247,7 +247,7 @@ private TsavoriteLog(TsavoriteLogSettings logSettings, bool syncRecover, ILogger
{
try
{
- Recover(-1);
+ RecoverAsync(-1).AsTask().GetAwaiter().GetResult();
}
catch { }
}
@@ -543,8 +543,7 @@ public void Initialize(long beginAddress, long committedUntilAddress, long lastC
CommittedBeginAddress = beginAddress;
// Align monotonic page trackers to the restored address so that post-recovery producer
- // drive and page-shift callbacks re-arm correctly (they only advance beyond the
- // initial floor).
+ // drive and page-shift callbacks re-arm correctly (they only advance beyond the initial floor).
var resetPage = committedUntilAddress >> allocator.LogPageSizeBits;
Volatile.Write(ref lastPublishedSafeTailPage, resetPage);
Volatile.Write(ref lastProducerObservedPage, resetPage);
@@ -567,15 +566,16 @@ public void Initialize(long beginAddress, long committedUntilAddress, long lastC
/// Recover TsavoriteLog to the specific commit number, or latest if -1
///
/// Requested commit number
- public void Recover(long requestedCommitNum = -1)
+ /// <param name="cancellationToken">Cancellation token</param>
+ public async ValueTask RecoverAsync(long requestedCommitNum = -1, CancellationToken cancellationToken = default)
{
if (CommittedUntilAddress > BeginAddress)
throw new TsavoriteException($"Already recovered until address {CommittedUntilAddress}");
- if (requestedCommitNum == -1)
- RestoreLatest(out RecoveredCookie);
- else
- RestoreSpecificCommit(requestedCommitNum, out RecoveredCookie);
+ RecoveredCookie = requestedCommitNum == -1
+ ? await RestoreLatestAsync(cancellationToken).ConfigureAwait(false)
+ : await RestoreSpecificCommitAsync(requestedCommitNum, cancellationToken).ConfigureAwait(false);
+ persistedCommitNum = commitNum;
}
///
@@ -587,10 +587,7 @@ public static async ValueTask CreateAsync(TsavoriteLogSettings log
{
var log = new TsavoriteLog(logSettings, false);
if (logSettings.TryRecoverLatest)
- {
- var cookie = await log.RestoreLatestAsync(cancellationToken).ConfigureAwait(false);
- log.RecoveredCookie = cookie;
- }
+ await log.RecoverAsync(cancellationToken: cancellationToken).ConfigureAwait(false);
return log;
}
@@ -2754,18 +2751,6 @@ private void SerialCommitCallbackWorker(CommitInfo commitInfo)
_ = (_commitTcs?.TrySetResult(lci));
}
- ///
- /// Synchronously recover instance to TsavoriteLog's latest valid commit, when being used as a readonly log iterator
- ///
- public void RecoverReadOnly()
- {
- if (!readOnlyMode)
- throw new TsavoriteException("This method can only be used with a read-only TsavoriteLog instance used for iteration. Set TsavoriteLogSettings.ReadOnlyMode to true during creation to indicate this.");
-
- RestoreLatest(out _);
- SignalWaitingROIterators();
- }
-
///
/// Asynchronously recover instance to TsavoriteLog's latest commit, when being used as a readonly log iterator
///
@@ -2774,7 +2759,8 @@ public async ValueTask RecoverReadOnlyAsync(CancellationToken cancellationToken
if (!readOnlyMode)
throw new TsavoriteException("This method can only be used with a read-only TsavoriteLog instance used for iteration. Set TsavoriteLogSettings.ReadOnlyMode to true during creation to indicate this.");
- _ = await RestoreLatestAsync(cancellationToken).ConfigureAwait(false);
+ RecoveredCookie = await RestoreLatestAsync(cancellationToken).ConfigureAwait(false);
+ persistedCommitNum = commitNum;
SignalWaitingROIterators();
}
@@ -2814,9 +2800,11 @@ private bool LoadCommitMetadata(long commitNum, out TsavoriteLogRecoveryInfo inf
return true;
}
- private void RestoreLatest(out byte[] cookie)
+ ///
+ /// Restore log asynchronously
+ ///
+ private async ValueTask RestoreLatestAsync(CancellationToken cancellationToken)
{
- cookie = null;
TsavoriteLogRecoveryInfo info = new();
long scanStart = 0;
@@ -2836,7 +2824,7 @@ private void RestoreLatest(out byte[] cookie)
// Only in fast commit mode will we potentially need to recover from an entry in the log
if (fastCommitMode)
{
- // Disable safe guards temporarily
+ // Temporarily disable the safeguards while scanning the log for a later commit
CommittedUntilAddress = long.MaxValue;
beginAddress = info.BeginAddress;
allocator.HeadAddress = long.MaxValue;
@@ -2848,19 +2836,18 @@ private void RestoreLatest(out byte[] cookie)
catch { }
}
- // If until address is 0, that means info is still its default value and we haven't been able to recover
+ // If UntilAddress is 0, info still has its default value and we haven't been able to recover
// from any any commit. Set the log to its start position and return
if (info.UntilAddress == 0)
{
- logger?.LogInformation("Unable to recover using any available commit");
-
- // Reset variables to normal
+ logger?.LogDebug("Unable to recover using any available commit");
+ // Reset the log to its initial (empty) state
allocator.Initialize();
CommittedUntilAddress = FirstValidAddress;
beginAddress = allocator.BeginAddress;
if (readOnlyMode)
allocator.HeadAddress = long.MaxValue;
- return;
+ return null;
}
if (!readOnlyMode)
@@ -2871,33 +2858,32 @@ private void RestoreLatest(out byte[] cookie)
if (headAddress == 0)
headAddress = FirstValidAddress;
-
try
{
- allocator.RestoreHybridLog(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress);
+ await allocator.RestoreHybridLogAsync(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress, cancellationToken: cancellationToken).ConfigureAwait(false);
}
catch
{
- if (!tolerateDeviceFailure) throw;
+ if (!tolerateDeviceFailure)
+ throw;
}
}
CompleteRestoreFromCommit(info);
- cookie = info.Cookie;
+ var cookie = info.Cookie;
commitNum = info.CommitNum;
- // After recovery, persisted commitnum remains 0 so we need to set it to latest commit number
- persistedCommitNum = info.CommitNum;
beginAddress = allocator.BeginAddress;
if (readOnlyMode)
allocator.HeadAddress = long.MaxValue;
if (scanStart > 0)
logCommitManager.OnRecovery(scanStart);
+
+ return cookie;
}
- private void RestoreSpecificCommit(long requestedCommitNum, out byte[] cookie)
+ private async ValueTask RestoreSpecificCommitAsync(long requestedCommitNum, CancellationToken cancellationToken)
{
- cookie = null;
TsavoriteLogRecoveryInfo info = new();
// Find the closest commit metadata with commit num smaller than requested
@@ -2951,84 +2937,13 @@ private void RestoreSpecificCommit(long requestedCommitNum, out byte[] cookie)
headAddress = FirstValidAddress;
try
{
- allocator.RestoreHybridLog(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress);
+ await allocator.RestoreHybridLogAsync(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress, cancellationToken: cancellationToken).ConfigureAwait(false);
}
catch
{
- if (!tolerateDeviceFailure) throw;
- }
- }
-
- CompleteRestoreFromCommit(info);
- cookie = info.Cookie;
- commitNum = persistedCommitNum = info.CommitNum;
- beginAddress = allocator.BeginAddress;
- if (readOnlyMode)
- allocator.HeadAddress = long.MaxValue;
-
- if (scanStart > 0)
- logCommitManager.OnRecovery(scanStart);
- }
-
- ///
- /// Restore log asynchronously
- ///
- private async ValueTask RestoreLatestAsync(CancellationToken cancellationToken)
- {
- TsavoriteLogRecoveryInfo info = new();
-
- long scanStart = 0;
- foreach (var metadataCommit in logCommitManager.ListCommits())
- {
- try
- {
- if (LoadCommitMetadata(metadataCommit, out info))
- {
- scanStart = metadataCommit;
- break;
- }
- }
- catch { }
- }
-
- // Only in fast commit mode will we potentially need to recover from an entry in the log
- if (fastCommitMode)
- {
- // Shut up safe guards, I know what I am doing
- CommittedUntilAddress = long.MaxValue;
- beginAddress = info.BeginAddress;
- allocator.HeadAddress = long.MaxValue;
- try
- {
- using var scanIterator = Scan(info.UntilAddress, long.MaxValue, recover: false);
- _ = scanIterator.ScanForwardForCommit(ref info);
+ if (!tolerateDeviceFailure)
+ throw;
}
- catch { }
- }
-
- // if until address is 0, that means info is still its default value and we haven't been able to recover
- // from any any commit. Set the log to its start position and return
- if (info.UntilAddress == 0)
- {
- logger?.LogDebug("Unable to recover using any available commit");
- // Reset things to be something normal lol
- allocator.Initialize();
- CommittedUntilAddress = FirstValidAddress;
- beginAddress = allocator.BeginAddress;
- if (readOnlyMode)
- allocator.HeadAddress = long.MaxValue;
- return null;
- }
-
- if (!readOnlyMode)
- {
- var headAddress = info.UntilAddress - allocator.GetOffsetOnPage(info.UntilAddress);
- if (info.BeginAddress > headAddress)
- headAddress = info.BeginAddress;
-
- if (headAddress == 0)
- headAddress = FirstValidAddress;
- await allocator.RestoreHybridLogAsync(info.BeginAddress, headAddress, info.UntilAddress, info.UntilAddress, cancellationToken: cancellationToken).ConfigureAwait(false);
}
CompleteRestoreFromCommit(info);
diff --git a/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs b/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs
index 9bff4c26769..d67073c9c8a 100644
--- a/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Utilities/PageAsyncResultTypes.cs
@@ -10,6 +10,8 @@
namespace Tsavorite.core
{
+ internal enum RecoveryPhase : byte { None = 0, Pass1 = 1, Pass2 = 2 }
+
///
/// Result of async page read
///
@@ -47,12 +49,12 @@ public sealed class PageAsyncReadResult
/// The max offset on the main log page to iterate records when determining how many bytes in the ObjectLog to read.
internal long maxAddressOffsetOnPage;
- /// If true, we are called from recovery, and should use the non-transient .
- internal bool isForRecovery;
+ /// The recovery phase for this read. Non- uses the non-transient .
+ internal RecoveryPhase recoveryPhase;
///
public override string ToString()
- => $"page {page}, isRecov {isForRecovery}, devPgOffset {devicePageOffset}, ctx {context}, countdown {handle?.CurrentCount}, destPtr {destinationPtr} (0x{destinationPtr:X}), maxPtr {maxAddressOffsetOnPage}";
+ => $"page {page}, recovPhase {recoveryPhase}, devPgOffset {devicePageOffset}, ctx {context}, countdown {handle?.CurrentCount}, destPtr {destinationPtr} (0x{destinationPtr:X}), maxPtr {maxAddressOffsetOnPage}";
/// Currently nothing to free.
public void Free()
@@ -128,6 +130,14 @@ public sealed class PageAsyncFlushResult
/// If this is set then we are using a different objectLog device from that in the allocator, and do not use the allocator's .
internal ObjectLogFilePositionInfo objectLogFilePositionInfo;
+ /// During snapshot recovery, the snapshot object-log device that is the source for copying object bytes into the main object-log
+ /// (for records at/above <see cref="recoveryFormerFlushedUntilAddress"/>). Null for non-recovery flushes and for the hybrid-log region.
+ internal IDevice recoverySnapshotObjectLogDevice;
+
+ /// During snapshot recovery, the former FlushedUntilAddress (the hybrid-log/snapshot boundary). Records whose logical address is at or
+ /// above this are in the snapshot region and their objects must be copied from the snapshot object-log to the main object-log during the flush.
+ internal long recoveryFormerFlushedUntilAddress;
+
///
public override string ToString()
{
diff --git a/libs/storage/Tsavorite/cs/test/MiscTests.cs b/libs/storage/Tsavorite/cs/test/MiscTests.cs
index d1d85537882..cc9b7112c1f 100644
--- a/libs/storage/Tsavorite/cs/test/MiscTests.cs
+++ b/libs/storage/Tsavorite/cs/test/MiscTests.cs
@@ -3,6 +3,7 @@
using System;
using System.IO;
+using System.Threading.Tasks;
using Garnet.test;
using NUnit.Framework;
using NUnit.Framework.Legacy;
@@ -41,7 +42,7 @@ public void TearDown()
[Test]
[Category("TsavoriteKV")]
- public void ForceRCUAndRecover([Values(UpdateOp.Upsert, UpdateOp.Delete)] UpdateOp updateOp)
+ public async Task ForceRCUAndRecover([Values(UpdateOp.Upsert, UpdateOp.Delete)] UpdateOp updateOp)
{
var copyOnWrite = new FunctionsCopyOnWrite();
ClientSession session = default;
@@ -115,7 +116,7 @@ public void ForceRCUAndRecover([Values(UpdateOp.Upsert, UpdateOp.Delete)] Update
, (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)
);
- _ = store.Recover(token);
+ _ = await store.RecoverAsync(token).ConfigureAwait(false);
session = store.NewSession(copyOnWrite);
bContext = session.BasicContext;
diff --git a/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs b/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs
index 4d20b12fbe3..5a7aa8f9e6a 100644
--- a/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs
+++ b/libs/storage/Tsavorite/cs/test/SharedDirectoryTests.cs
@@ -51,7 +51,7 @@ public void TearDown()
[Category("TsavoriteKV")]
[Category("CheckpointRestore")]
[Category("Smoke")]
- public async ValueTask SharedLogDirectory([Values] bool isAsync)
+ public async ValueTask SharedLogDirectory()
{
original.Initialize(Path.Join(TestUtils.MethodTestDir, "OriginalCheckpoint"), sharedLogDirectory);
ClassicAssert.IsTrue(IsDirectoryEmpty(sharedLogDirectory)); // sanity check
@@ -72,10 +72,7 @@ public async ValueTask SharedLogDirectory([Values] bool isAsync)
// Recover from original checkpoint
clone.Initialize(cloneCheckpointDirectory, sharedLogDirectory, populateLogHandles: true);
- if (isAsync)
- _ = await clone.Store.RecoverAsync(checkpointGuid).ConfigureAwait(false);
- else
- _ = clone.Store.Recover(checkpointGuid);
+ _ = await clone.Store.RecoverAsync(checkpointGuid).ConfigureAwait(false);
// Both sessions should work concurrently
Test(original, checkpointGuid);
diff --git a/libs/storage/Tsavorite/cs/test/TestUtils.cs b/libs/storage/Tsavorite/cs/test/TestUtils.cs
index 7090cb8e7a1..be9c472189b 100644
--- a/libs/storage/Tsavorite/cs/test/TestUtils.cs
+++ b/libs/storage/Tsavorite/cs/test/TestUtils.cs
@@ -220,8 +220,6 @@ public enum AllocatorType
Object
}
- public enum CompletionSyncMode { Sync, Async }
-
public enum ReadCopyDestination { Tail, ReadCache }
public enum FlushMode { NoFlush, ReadOnly, OnDisk }
diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs
index bf408ca5295..06735b5a2b0 100644
--- a/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.hlog/FlakyDeviceTests.cs
@@ -203,6 +203,5 @@ public async ValueTask FlakyLogTestTolerateFailure([Values] IteratorType iterato
}
recoveredLog.Dispose();
}
-
}
}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs
index 6539699e21b..b04683de1ad 100644
--- a/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.hlog/LogFastCommitTests.cs
@@ -5,6 +5,7 @@
using System.Collections.Generic;
using System.IO;
using System.Threading;
+using System.Threading.Tasks;
using NUnit.Framework;
using NUnit.Framework.Legacy;
using Tsavorite.core;
@@ -23,7 +24,7 @@ internal class LogFastCommitTests : TsavoriteLogTestBase
[Test]
[Category("TsavoriteLog")]
[Category("Smoke")]
- public void TsavoriteLogSimpleFastCommitTest([Values] TestUtils.TestDeviceType deviceType)
+ public async Task TsavoriteLogSimpleFastCommitTest([Values] TestUtils.TestDeviceType deviceType)
{
var cookie = new byte[100];
new Random().NextBytes(cookie);
@@ -70,13 +71,13 @@ public void TsavoriteLogSimpleFastCommitTest([Values] TestUtils.TestDeviceType d
// Recovery should still work
var recoveredLog = new TsavoriteLog(logSettings);
- recoveredLog.Recover(1);
+ await recoveredLog.RecoverAsync(1).ConfigureAwait(false);
ClassicAssert.AreEqual(cookie1, recoveredLog.RecoveredCookie);
ClassicAssert.AreEqual(commit1Addr, recoveredLog.TailAddress);
recoveredLog.Dispose();
recoveredLog = new TsavoriteLog(logSettings);
- recoveredLog.Recover(2);
+ await recoveredLog.RecoverAsync(2).ConfigureAwait(false);
ClassicAssert.AreEqual(cookie2, recoveredLog.RecoveredCookie);
ClassicAssert.AreEqual(commit2Addr, recoveredLog.TailAddress);
recoveredLog.Dispose();
diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs
index 985bb96f9e6..b10ebfd071b 100644
--- a/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.hlog/LogRecoverReadOnlyTests.cs
@@ -50,15 +50,15 @@ public void TearDown()
[Test]
[Category("TsavoriteLog")]
- public async Task RecoverReadOnlyCheck1([Values] bool isAsync)
+ public async Task RecoverReadOnlyCheck1()
{
using var device = Devices.CreateLogDevice(deviceName);
var logSettings = new TsavoriteLogSettings { LogDevice = device, MemorySizeBits = MinKvLogMemorySizeBits, PageSizeBits = MinKvLogPageSizeBits, MutableFraction = 0.5, SegmentSizeBits = MinKvLogPageSizeBits + 1, TryRecoverLatest = false };
- using var log = isAsync ? await TsavoriteLog.CreateAsync(logSettings).ConfigureAwait(false) : new TsavoriteLog(logSettings);
+ using var log = await TsavoriteLog.CreateAsync(logSettings).ConfigureAwait(false);
await Task.WhenAll(ProducerAsync(log, cts),
CommitterAsync(log, cts.Token),
- ReadOnlyConsumerAsync(deviceName, isAsync, cts.Token)).ConfigureAwait(false);
+ ReadOnlyConsumerAsync(deviceName, cts.Token)).ConfigureAwait(false);
}
private async Task ProducerAsync(TsavoriteLog log, CancellationTokenSource cts)
@@ -86,17 +86,17 @@ private static async Task CommitterAsync(TsavoriteLog log, CancellationToken can
catch (OperationCanceledException) { }
}
- // This creates a separate TsavoriteLog over the same log file, using RecoverReadOnly to continuously update
+ // This creates a separate TsavoriteLog over the same log file, using RecoverReadOnlyAsync to continuously update
// to the primary TsavoriteLog's commits.
- private async Task ReadOnlyConsumerAsync(string deviceName, bool isAsync, CancellationToken cancellationToken)
+ private async Task ReadOnlyConsumerAsync(string deviceName, CancellationToken cancellationToken)
{
using var device = Devices.CreateLogDevice(deviceName);
var logSettings = new TsavoriteLogSettings { LogDevice = device, ReadOnlyMode = true, PageSizeBits = MinKvLogPageSizeBits, SegmentSizeBits = MinKvLogPageSizeBits + 1 };
- using var log = isAsync ? await TsavoriteLog.CreateAsync(logSettings, cancellationToken).ConfigureAwait(false) : new TsavoriteLog(logSettings);
+ using var log = await TsavoriteLog.CreateAsync(logSettings, cancellationToken).ConfigureAwait(false);
var _ = BeginRecoverAsyncLoop();
- // This enumerator waits asynchronously when we have reached the committed tail of the duplicate TsavoriteLog. When RecoverReadOnly
+ // This enumerator waits asynchronously when we have reached the committed tail of the duplicate TsavoriteLog. When RecoverReadOnlyAsync
// reads new data committed by the primary TsavoriteLog, it signals commit completion to let iter continue to the new tail.
using var iter = log.Scan(log.BeginAddress, long.MaxValue);
var prevValue = -1L;
@@ -127,12 +127,7 @@ async Task BeginRecoverAsyncLoop()
{
try
{
- if (isAsync)
- {
- await log.RecoverReadOnlyAsync(cancellationToken).ConfigureAwait(false);
- }
- else
- log.RecoverReadOnly();
+ await log.RecoverReadOnlyAsync(cancellationToken).ConfigureAwait(false);
break;
}
catch
@@ -140,7 +135,7 @@ async Task BeginRecoverAsyncLoop()
Thread.Yield();
// retry until timeout
if (DateTimeOffset.UtcNow.Ticks - startTime > TimeSpan.FromSeconds(5).Ticks)
- throw new Exception("Timed out retrying RecoverReadOnly");
+ throw new Exception("Timed out retrying RecoverReadOnlyAsync");
}
}
}
diff --git a/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs b/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs
index 00b19cd27ae..1a4a33c2480 100644
--- a/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.hlog/LogTests.cs
@@ -1048,7 +1048,7 @@ public void TsavoriteLogSimpleCommitCookieTest([Values] bool fastCommit)
[Test]
[Category("TsavoriteLog")]
- public void TsavoriteLogManualCommitTest()
+ public async Task TsavoriteLogManualCommitTest()
{
device = Devices.CreateLogDevice(Path.Join(MethodTestDir, "logManualCommitTest.log"), deleteOnClose: true);
var logSettings = new TsavoriteLogSettings
@@ -1089,13 +1089,13 @@ public void TsavoriteLogManualCommitTest()
ClassicAssert.IsTrue(commitSuccessful);
var recoveredLog = new TsavoriteLog(logSettings);
- recoveredLog.Recover(1);
+ await recoveredLog.RecoverAsync(1).ConfigureAwait(false);
ClassicAssert.AreEqual(cookie1, recoveredLog.RecoveredCookie);
ClassicAssert.AreEqual(commit1Addr, recoveredLog.TailAddress);
recoveredLog.Dispose();
recoveredLog = new TsavoriteLog(logSettings);
- recoveredLog.Recover(2);
+ await recoveredLog.RecoverAsync(2).ConfigureAwait(false);
ClassicAssert.AreEqual(cookie2, recoveredLog.RecoveredCookie);
ClassicAssert.AreEqual(commit2Addr, recoveredLog.TailAddress);
recoveredLog.Dispose();
@@ -1104,7 +1104,7 @@ public void TsavoriteLogManualCommitTest()
try
{
recoveredLog = new TsavoriteLog(logSettings);
- recoveredLog.Recover(4);
+ await recoveredLog.RecoverAsync(4).ConfigureAwait(false);
Assert.Fail();
}
catch (TsavoriteException)
diff --git a/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs b/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs
index 6e4531dba20..ad0a7609110 100644
--- a/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recordops/RecordLifecycleTests.cs
@@ -253,7 +253,7 @@ public void CopyUpdateDoesNotFireOnDisposeCopyUpdated()
}
ClassicAssert.AreEqual(0, tracker.DisposeCount(DisposeReason.CopyUpdated),
- "CopyUpdated is handled internally by logSizeTracker — OnDispose must not fire for it");
+ "CopyUpdated is handled internally by logSizeTracker; OnDispose must not fire for it");
ClassicAssert.AreEqual(0, tracker.DisposeCount(DisposeReason.Deleted),
"Deleted must not fire on a CopyUpdate path");
ClassicAssert.AreEqual(0, tracker.TotalEvict(),
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs
index 13e2a08d4c0..2ef4c98818d 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/ComponentRecoveryTests.cs
@@ -3,6 +3,7 @@
using System;
using System.IO;
+using System.Threading;
using System.Threading.Tasks;
using Garnet.test;
using NUnit.Framework;
@@ -68,15 +69,12 @@ private static unsafe void Finish_MallocFixedPageSizeRecoveryTest(int seed, IDev
[Test]
[Category("CheckpointRestore")]
[Category("Smoke")]
- public void MallocFixedPageSizeRecoveryTest()
+ public async Task MallocFixedPageSizeRecoveryTest()
{
Setup_MallocFixedPageSizeRecoveryTest(out int seed, out IDevice device, out int numBucketsToAdd, out long[] logicalAddresses, out ulong numBytesWritten);
var recoveredAllocator = new MallocFixedPageSize();
- //issue call to recover
- recoveredAllocator.BeginRecovery(device, 0, numBucketsToAdd, numBytesWritten, out ulong numBytesRead);
- //wait until complete
- recoveredAllocator.IsRecoveryCompleted(true);
+ var numBytesRead = await recoveredAllocator.RecoverAsync(device, 0, numBucketsToAdd, numBytesWritten, CancellationToken.None).ConfigureAwait(false);
Finish_MallocFixedPageSizeRecoveryTest(seed, device, numBucketsToAdd, logicalAddresses, numBytesWritten, recoveredAllocator, numBytesRead);
}
@@ -158,7 +156,7 @@ private static unsafe void Finish_FuzzyIndexRecoveryTest(int seed, long numAdds,
[Test]
[Category("CheckpointRestore")]
[Category("Smoke")]
- public unsafe void FuzzyIndexRecoveryTest()
+ public async Task FuzzyIndexRecoveryTest()
{
Setup_FuzzyIndexRecoveryTest(out int seed, out int size, out long numAdds, out IDevice ht_device, out IDevice ofb_device, out TsavoriteBase hash_table1,
out ulong ht_num_bytes_written, out ulong ofb_num_bytes_written, out int num_ofb_buckets);
@@ -166,10 +164,7 @@ public unsafe void FuzzyIndexRecoveryTest()
var hash_table2 = new TsavoriteBase();
hash_table2.Initialize(size, 512);
- //issue recover call
- hash_table2.RecoverFuzzyIndex(0, ht_device, ht_num_bytes_written, ofb_device, num_ofb_buckets, ofb_num_bytes_written);
- //wait until complete
- hash_table2.IsFuzzyIndexRecoveryComplete(true);
+ await hash_table2.RecoverFuzzyIndexAsync(0, ht_device, ht_num_bytes_written, ofb_device, num_ofb_buckets, ofb_num_bytes_written, CancellationToken.None).ConfigureAwait(false);
Finish_FuzzyIndexRecoveryTest(seed, numAdds, ht_device, ofb_device, hash_table1, hash_table2);
}
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs
index e01ee15c67a..452f103cc87 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/LargeObjectTests.cs
@@ -68,7 +68,7 @@ public async ValueTask LargeObjectTest([Values(CheckpointType.Snapshot, Checkpoi
, StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestLargeObjectValue.Serializer())
, (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)))
{
- _ = store.Recover(token);
+ _ = await store.RecoverAsync(token).ConfigureAwait(false);
using (var session = store.NewSession(new TestLargeObjectFunctions()))
DoRead(session, numObjects, store);
@@ -147,7 +147,7 @@ public async ValueTask MultiListObjectTest([Values(CheckpointType.Snapshot, Chec
DoRead(session, numObjects, store);
_ = store.TryInitiateFullCheckpoint(out token, checkpointType);
- await store.CompleteCheckpointAsync();
+ await store.CompleteCheckpointAsync().ConfigureAwait(false);
}
// Step 1: Create and recover store.
@@ -157,7 +157,7 @@ public async ValueTask MultiListObjectTest([Values(CheckpointType.Snapshot, Chec
, StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestMultiListObjectValue.Serializer())
, (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)))
{
- _ = store.Recover(token);
+ _ = await store.RecoverAsync(token).ConfigureAwait(false);
using (var session = store.NewSession(new TestMultiListObjectFunctions()))
DoRead(session, numObjects, store);
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoverySnapshotEvictionTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoverySnapshotEvictionTests.cs
new file mode 100644
index 00000000000..1b0d3c27693
--- /dev/null
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoverySnapshotEvictionTests.cs
@@ -0,0 +1,207 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.IO;
+using System.Threading.Tasks;
+using Garnet.test;
+using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using Tsavorite.core;
+using static Tsavorite.test.TestUtils;
+
+namespace Tsavorite.test.recovery.objects
+{
+ using ClassAllocator = ObjectAllocator>;
+ using ClassStoreFunctions = StoreFunctions;
+
+ /// <summary>
+ /// Exercises the deferred object-load path of snapshot recovery (see RecoverHybridLogFromSnapshotFileAsync):
+ /// the hybrid-log phase reads its pages without loading their objects, then after the snapshot pages have also been
+ /// read (without their objects), objects are loaded once over the full recovered range honoring the final headAddress.
+ /// The recovered range spans both the hybrid-log region (objects in the main object-log) and the snapshot region
+ /// (objects in the snapshot object-log), with the device boundary at the page that contains FlushedUntilAddress.
+ /// A LogSizeTracker is optionally attached to the recovery store to force
+ /// eviction during the deferred load, covering: no eviction (both region loads run over resident pages), partial
+ /// eviction (headAddress stays in the hybrid-log region so both loads run), and heavy eviction (headAddress is pushed
+ /// into the snapshot region so only the snapshot-region load runs). A non-power-of-2 buffer is also covered.
+ /// </summary>
+ [TestFixture]
+ public class ObjectRecoverySnapshotEvictionTests : TestBase
+ {
+ const int NumRecords = 6000;
+
+ // Runs before each test: recreates the per-method test directory.
+ [SetUp]
+ public void Setup()
+ {
+ RecreateDirectory(MethodTestDir);
+ }
+
+ // Runs after each test: delegates cleanup to the shared TestUtils tear-down helper.
+ [TearDown]
+ public void TearDown()
+ {
+ TestUtils.OnTearDown();
+ }
+
+ // Writes NumRecords objects, takes a Snapshot hybrid-log checkpoint, recovers it into a fresh store (optionally with a
+ // size tracker attached), and then reads every key back to check its value.
+ //
+ // recoveryTargetPageCount: 0 => no size tracker (no eviction); otherwise attach a tracker whose target is that many
+ // pages, forcing eviction during recovery. 4 is the minimum (LogSizeTracker.MinTargetPageCount). Small values force
+ // the snapshot-only load; larger values leave the head in the hybrid-log region so both region loads run.
+ // logMemoryPages: the max allocated page count; 24 is not a power of two, so BufferSize (next power of two = 32)
+ // has empty slots that the load loop must skip.
+ [Test]
+ [Category("TsavoriteKV"), Category("CheckpointRestore")]
+ public async Task SnapshotRecoveryDeferredObjectLoad(
+ [Values(0, 4, 8, 20, 64)] int recoveryTargetPageCount,
+ [Values(32, 24)] int logMemoryPages)
+ {
+ var logMemorySize = (long)logMemoryPages * MinKvLogPageSize;
+
+ // Write records (spanning many pages so some are flushed to the main log before the checkpoint, creating a
+ // hybrid-log region) and take a Snapshot checkpoint capturing the still-mutable region as the snapshot region.
+ Prepare(logMemorySize, out var log, out var objlog, out var store);
+ try
+ {
+ using (var session = store.NewSession(new TestObjectFunctions()))
+ {
+ var bContext = session.BasicContext;
+ for (var i = 0; i < NumRecords; i++)
+ _ = bContext.Upsert(new TestObjectKey { key = i }, new TestObjectValue { value = i });
+ }
+
+ // Wait for the checkpoint to complete, then dispose the writer store and its devices before recovering.
+ _ = store.TryInitiateHybridLogCheckpoint(out var token, CheckpointType.Snapshot);
+ await store.CompleteCheckpointAsync().AsTask().ConfigureAwait(false);
+ Destroy(log, objlog, store);
+
+ // Recover into a fresh store, optionally under memory pressure so the deferred object load must evict.
+ Prepare(logMemorySize, out log, out objlog, out store);
+ if (recoveryTargetPageCount > 0)
+ {
+ var targetSize = (long)recoveryTargetPageCount * MinKvLogPageSize;
+ var tracker = new LogSizeTracker(store.Log, targetSize, targetSize / 8, targetSize / 16, logger: null);
+ store.Log.SetLogSizeTracker(tracker);
+ }
+
+ // Recover using only the hybrid-log checkpoint token (the first token argument is left as default).
+ _ = await store.RecoverAsync(default, token).ConfigureAwait(false);
+
+ // Every record must recover correctly, whether it ended up resident or was evicted (and is read from disk).
+ using (var session = store.NewSession(new TestObjectFunctions()))
+ {
+ var bContext = session.BasicContext;
+ for (var i = 0; i < NumRecords; i++)
+ {
+ var key = new TestObjectKey { key = i };
+ TestObjectInput input = default;
+ TestObjectOutput output = new();
+ var status = bContext.Read(key, ref input, ref output);
+ // A pending read is completed with wait: true and its single completed result replaces status/output.
+ if (status.IsPending)
+ {
+ Assert.That(bContext.CompletePendingWithOutputs(out var completedOutputs, wait: true), Is.True);
+ (status, output) = GetSinglePendingResult(completedOutputs);
+ }
+
+ ClassicAssert.IsTrue(status.Found, $"key {i} not found (target pages {recoveryTargetPageCount}, mem pages {logMemoryPages})");
+ ClassicAssert.AreEqual(i, output.value.value, $"key {i} wrong value");
+ }
+ }
+
+ // With a small memory budget, eviction must have advanced the head above the begin address.
+ if (recoveryTargetPageCount is > 0 and <= 8)
+ ClassicAssert.Greater(store.Log.HeadAddress, store.Log.BeginAddress, "expected eviction to advance HeadAddress");
+ }
+ finally
+ {
+ // Disposes whichever store/devices are current (the recovery store once the second Prepare has run).
+ Destroy(log, objlog, store);
+ }
+ }
+
+ // After recovering an object store into a smaller memory budget (so snapshot object pages are evicted and their objects are read back from the
+ // main object-log that RecoverSnapshotPages copied them into), compact the log and truncate it, then verify every record is still readable.
+ // Compaction reads each live record's objects from the main object-log (validating the copied positions), and Truncate drops the now-stale main-log
+ // and object-log segments using each page header's lowest-object-log position (which the recovery flush set to the main object-log).
+ // compactionType: the compaction mode passed to session.Compact (all enum values are exercised).
+ // logMemoryPages: the max allocated page count used for both the writer store and the recovery store.
+ [Test]
+ [Category("TsavoriteKV"), Category("CheckpointRestore")]
+ public async Task SnapshotRecoveryThenCompactTruncate(
+ [Values] CompactionType compactionType,
+ [Values(32, 24)] int logMemoryPages)
+ {
+ var logMemorySize = (long)logMemoryPages * MinKvLogPageSize;
+
+ Prepare(logMemorySize, out var log, out var objlog, out var store);
+ try
+ {
+ using (var session = store.NewSession(new TestObjectFunctions()))
+ {
+ var bContext = session.BasicContext;
+ for (var i = 0; i < NumRecords; i++)
+ _ = bContext.Upsert(new TestObjectKey { key = i }, new TestObjectValue { value = i });
+ }
+
+ // Await the checkpoint instead of blocking on it: this test is async, and the sibling test awaits the same call.
+ _ = store.TryInitiateHybridLogCheckpoint(out var token, CheckpointType.Snapshot);
+ await store.CompleteCheckpointAsync().AsTask().ConfigureAwait(false);
+ Destroy(log, objlog, store);
+
+ // Recover under memory pressure so snapshot object pages are evicted during recovery (their objects must be read back from the main object-log).
+ Prepare(logMemorySize, out log, out objlog, out store);
+ var targetSize = 8L * MinKvLogPageSize;
+ var tracker = new LogSizeTracker(store.Log, targetSize, targetSize / 8, targetSize / 16, logger: null);
+ store.Log.SetLogSizeTracker(tracker);
+
+ _ = await store.RecoverAsync(default, token).ConfigureAwait(false);
+
+ // Recovery has forced eviction of snapshot object pages (their objects were copied into the main object-log). Relax the budget before
+ // compaction so the tight recovery target does not starve Compact's allocation (Compact copies live records to the tail); the log still
+ // spills to disk via its normal LogMemorySize-driven eviction, so compaction continues to read evicted records' objects from the main object-log.
+ tracker.UpdateTargetSize(1L << 30, 1L << 27, 1L << 26);
+
+ using (var session = store.NewSession(new TestObjectFunctions()))
+ {
+ var bContext = session.BasicContext;
+
+ // Compact the entire recovered region (reading each live record's objects from the main object-log), then truncate the stale segments.
+ var compactUntil = session.Compact(store.Log.TailAddress, compactionType);
+ store.Log.Truncate();
+ ClassicAssert.AreEqual(compactUntil, store.Log.BeginAddress, "BeginAddress should advance to compactUntil after Truncate");
+
+ // Every record must still be readable after Compact + Truncate.
+ for (var i = 0; i < NumRecords; i++)
+ {
+ var key = new TestObjectKey { key = i };
+ TestObjectInput input = default;
+ TestObjectOutput output = new();
+ var status = bContext.Read(key, ref input, ref output);
+ // A pending read is completed with wait: true and its single completed result replaces status/output.
+ if (status.IsPending)
+ {
+ Assert.That(bContext.CompletePendingWithOutputs(out var completedOutputs, wait: true), Is.True);
+ (status, output) = GetSinglePendingResult(completedOutputs);
+ }
+
+ ClassicAssert.IsTrue(status.Found, $"key {i} not found after compact/truncate (compactionType {compactionType}, mem pages {logMemoryPages})");
+ ClassicAssert.AreEqual(i, output.value.value, $"key {i} wrong value after compact/truncate");
+ }
+ }
+ }
+ finally
+ {
+ // Disposes whichever store/devices are current (the recovery store once the second Prepare has run).
+ Destroy(log, objlog, store);
+ }
+ }
+
+ // Creates the main-log and object-log devices under MethodTestDir and constructs a store over them.
+ // logMemorySize: passed through as the store's LogMemorySize.
+ // log / objlog / store: the created devices and store, returned to the caller (which disposes them via Destroy).
+ // The device file names and checkpoint directory are fixed, so a second call within the same test opens the same paths.
+ private static void Prepare(long logMemorySize, out IDevice log, out IDevice objlog, out TsavoriteKV store)
+ {
+ log = Devices.CreateLogDevice(Path.Combine(MethodTestDir, "snapevict.log"));
+ objlog = Devices.CreateLogDevice(Path.Combine(MethodTestDir, "snapevict.obj.log"));
+ store = new(new()
+ {
+ IndexSize = 1L << 22,
+ LogDevice = log,
+ ObjectLogDevice = objlog,
+ SegmentSize = 1L << 20,
+ LogMemorySize = logMemorySize,
+ PageSize = MinKvLogPageSize,
+ CheckpointDir = Path.Combine(MethodTestDir, "check-points")
+ }, StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestObjectValue.Serializer())
+ , (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)
+ );
+ }
+
+ // Disposes the store first, then the main-log device, then the object-log device.
+ // NOTE(review): the store is presumably disposed first because it uses both devices - confirm before reordering.
+ private static void Destroy(IDevice log, IDevice objlog, TsavoriteKV store)
+ {
+ store.Dispose();
+ log.Dispose();
+ objlog.Dispose();
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs
index a263af618fc..63e5d9cf6a8 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest.cs
@@ -11,7 +11,6 @@
namespace Tsavorite.test.recovery.objects
{
- using static Tsavorite.test.TestUtils;
using ClassAllocator = ObjectAllocator>;
using ClassStoreFunctions = StoreFunctions;
@@ -78,15 +77,12 @@ private void PrepareToRecover()
[Test]
[Category("TsavoriteKV"), Category("CheckpointRestore")]
- public async ValueTask ObjectRecoveryTest1([Values] CompletionSyncMode syncMode)
+ public async ValueTask ObjectRecoveryTest1()
{
Populate();
PrepareToRecover();
- if (syncMode == CompletionSyncMode.Async)
- _ = await store.RecoverAsync(token, token).ConfigureAwait(false);
- else
- _ = store.Recover(token, token);
+ _ = await store.RecoverAsync(token, token).ConfigureAwait(false);
Verify(token, token);
}
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs
index 48efb240fc0..c77171d8be1 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest2.cs
@@ -38,8 +38,7 @@ public void TearDown()
public async ValueTask ObjectRecoveryTest2(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Range(300, 700, 300)] int numberOfRecords,
- [Values] CompletionSyncMode syncMode)
+ [Range(300, 700, 300)] int numberOfRecords)
{
this.numberOfRecords = numberOfRecords;
@@ -53,7 +52,7 @@ public async ValueTask ObjectRecoveryTest2(
session.Dispose();
_ = store.TryInitiateFullCheckpoint(out var guid, checkpointType); // guid is useful for debugging, but not otherwise used in this test
- await store.CompleteCheckpointAsync();
+ await store.CompleteCheckpointAsync().ConfigureAwait(false);
Destroy(log, objlog, store);
}
@@ -62,10 +61,7 @@ public async ValueTask ObjectRecoveryTest2(
{
Prepare(out var log, out var objlog, out var store);
- if (syncMode == CompletionSyncMode.Async)
- _ = await store.RecoverAsync().ConfigureAwait(false);
- else
- _ = store.Recover();
+ _ = await store.RecoverAsync().ConfigureAwait(false);
var session = store.NewSession(new TestObjectFunctions());
Read(session, delete: true);
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs
index 1f2483c04e5..905834b63c0 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/ObjectRecoveryTest3.cs
@@ -36,8 +36,7 @@ public void TearDown()
[Category("TsavoriteKV"), Category("CheckpointRestore")]
public async ValueTask ObjectRecoveryTest3(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values(1000)] int iterations,
- [Values] CompletionSyncMode syncMode)
+ [Values(1000)] int iterations)
{
this.iterations = iterations;
Prepare(out IDevice log, out IDevice objlog, out var store);
@@ -56,10 +55,7 @@ public async ValueTask ObjectRecoveryTest3(
{
Prepare(out log, out objlog, out store);
- if (syncMode == CompletionSyncMode.Async)
- _ = await store.RecoverAsync(default, item.Item2).ConfigureAwait(false);
- else
- _ = store.Recover(default, item.Item2);
+ _ = await store.RecoverAsync(default, item.Item2).ConfigureAwait(false);
var session2 = store.NewSession(new TestObjectFunctions());
Read(session2, false, item.Item1);
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs
index a1efb3cb892..5b438859715 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryCheckTests.cs
@@ -98,7 +98,7 @@ public class RecoveryCheck1Tests : RecoveryCheckBase
public async ValueTask RecoveryCheck1(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
+ [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
{
const long pageSize = MinKvLogPageSize;
using var store1 = new TsavoriteKV(new()
@@ -168,16 +168,8 @@ public async ValueTask RecoveryCheck1(
, (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)
);
- if (completionSyncMode == CompletionSyncMode.Async)
- {
- var (status, token) = await task;
- _ = await store2.RecoverAsync(default, token);
- }
- else
- {
- var (status, token) = task.AsTask().GetAwaiter().GetResult();
- _ = store2.Recover(default, token);
- }
+ var (_, token) = await task.ConfigureAwait(false);
+ _ = await store2.RecoverAsync(default, token).ConfigureAwait(false);
ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress);
ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress);
@@ -218,7 +210,7 @@ public class RecoveryCheck2Tests : RecoveryCheckBase
//[Repeat(3000)]
public async ValueTask RecoveryCheck2(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
+ [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
{
const long pageSize = MinKvLogPageSize;
using var store1 = new TsavoriteKV(new()
@@ -289,16 +281,8 @@ public async ValueTask RecoveryCheck2(
var task = store1.TakeHybridLogCheckpointAsync(checkpointType);
- if (completionSyncMode == CompletionSyncMode.Async)
- {
- var (status, token) = await task;
- _ = await store2.RecoverAsync(default, token);
- }
- else
- {
- var (status, token) = task.AsTask().GetAwaiter().GetResult();
- _ = store2.Recover(default, token);
- }
+ var (_, token) = await task.ConfigureAwait(false);
+ _ = await store2.RecoverAsync(default, token).ConfigureAwait(false);
ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress, $"iter {iter}");
ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress, $"iter {iter}");
@@ -328,7 +312,7 @@ public async ValueTask RecoveryCheck2(
[Test]
[Category("TsavoriteKV"), Category("CheckpointRestore")]
- public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType)
+ public async Task RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType)
{
Guid token = default;
const long pageSize = MinKvLogPageSize;
@@ -351,7 +335,7 @@ public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointTy
);
if (iter > 0)
- _ = store.Recover(default, token);
+ _ = await store.RecoverAsync(default, token).ConfigureAwait(false);
using var s1 = store.NewSession(new SimpleLongSimpleFunctions());
var bc1 = s1.BasicContext;
@@ -367,7 +351,7 @@ public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointTy
var task = store.TakeHybridLogCheckpointAsync(checkpointType);
bool success;
- (success, token) = task.AsTask().GetAwaiter().GetResult();
+ (success, token) = await task.ConfigureAwait(false);
ClassicAssert.IsTrue(success);
using var s2 = store.NewSession(new SimpleLongSimpleFunctions());
@@ -395,7 +379,7 @@ public void RecoveryCheck2Repeated([Values(CheckpointType.Snapshot, CheckpointTy
[Test]
[Category("TsavoriteKV"), Category("CheckpointRestore")]
- public void RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType)
+ public async Task RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType)
{
const long pageSize = MinKvLogPageSize;
using var store = new TsavoriteKV(new()
@@ -418,7 +402,7 @@ public void RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.Fol
_ = bc1.Upsert(TestSpanByteKey.FromPinnedSpan(SpanByte.FromPinnedVariable(ref key)), SpanByte.FromPinnedVariable(ref key));
var task = store.TakeHybridLogCheckpointAsync(checkpointType);
- (bool success, Guid token) = task.AsTask().GetAwaiter().GetResult();
+ (bool success, Guid token) = await task.ConfigureAwait(false);
ClassicAssert.IsTrue(success);
for (long key = 0; key < 1000; key++)
@@ -455,7 +439,7 @@ public void RecoveryRollback([Values(CheckpointType.Snapshot, CheckpointType.Fol
}
// Rollback to previous checkpoint
- _ = store.Recover(default, token);
+ _ = await store.RecoverAsync(default, token).ConfigureAwait(false);
for (long key = 0; key < 1000; key++)
{
@@ -515,7 +499,7 @@ public class RecoveryCheck3Tests : RecoveryCheckBase
[Category("TsavoriteKV"), Category("CheckpointRestore")]
public async ValueTask RecoveryCheck3(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
+ [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
{
const long pageSize = MinKvLogPageSize;
using var store1 = new TsavoriteKV(new()
@@ -586,16 +570,8 @@ public async ValueTask RecoveryCheck3(
var task = store1.TakeFullCheckpointAsync(checkpointType);
- if (completionSyncMode == CompletionSyncMode.Async)
- {
- var (status, token) = await task;
- _ = await store2.RecoverAsync(default, token);
- }
- else
- {
- var (status, token) = task.AsTask().GetAwaiter().GetResult();
- _ = store2.Recover(default, token);
- }
+ var (_, token) = await task.ConfigureAwait(false);
+ _ = await store2.RecoverAsync(default, token).ConfigureAwait(false);
ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress, $"iter {iter}");
ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress, $"iter {iter}");
@@ -637,7 +613,7 @@ public class RecoveryCheck4Tests : RecoveryCheckBase
[Category("TsavoriteKV"), Category("CheckpointRestore")]
public async ValueTask RecoveryCheck4(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
+ [Values] ReadCacheMode readCacheMode, [Values(1L << 13, 1L << 16)] long indexSize)
{
const long pageSize = MinKvLogPageSize;
using var store1 = new TsavoriteKV(new()
@@ -710,16 +686,8 @@ public async ValueTask RecoveryCheck4(
_ = store1.TakeIndexCheckpointAsync().AsTask().GetAwaiter().GetResult();
var task = store1.TakeHybridLogCheckpointAsync(checkpointType);
- if (completionSyncMode == CompletionSyncMode.Async)
- {
- var (status, token) = await task;
- _ = await store2.RecoverAsync(default, token);
- }
- else
- {
- var (status, token) = task.AsTask().GetAwaiter().GetResult();
- _ = store2.Recover(default, token);
- }
+ var (_, token) = await task.ConfigureAwait(false);
+ _ = await store2.RecoverAsync(default, token).ConfigureAwait(false);
ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress, $"iter {iter}");
ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress, $"iter {iter}");
@@ -761,7 +729,7 @@ public class RecoveryCheck5Tests : RecoveryCheckBase
[Category("CheckpointRestore")]
public async ValueTask RecoveryCheck5(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] bool isAsync, [Values] bool useReadCache, [Values(1L << 13, 1L << 16)] long indexSize)
+ [Values] bool useReadCache, [Values(1L << 13, 1L << 16)] long indexSize)
{
const long pageSize = MinKvLogPageSize;
using var store1 = new TsavoriteKV(new()
@@ -816,7 +784,7 @@ public async ValueTask RecoveryCheck5(
}
}
- var result = await store1.GrowIndexAsync();
+ var result = await store1.GrowIndexAsync().ConfigureAwait(false);
ClassicAssert.IsTrue(result);
for (long key = 0; key < 1000; key++)
@@ -852,16 +820,8 @@ public async ValueTask RecoveryCheck5(
, (allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions)
);
- if (isAsync)
- {
- var (status, token) = await task;
- _ = await store2.RecoverAsync(default, token);
- }
- else
- {
- var (status, token) = task.AsTask().GetAwaiter().GetResult();
- _ = store2.Recover(default, token);
- }
+ var (_, token) = await task.ConfigureAwait(false);
+ _ = await store2.RecoverAsync(default, token).ConfigureAwait(false);
ClassicAssert.AreEqual(store1.Log.HeadAddress, store2.Log.HeadAddress);
ClassicAssert.AreEqual(store1.Log.ReadOnlyAddress, store2.Log.ReadOnlyAddress);
@@ -951,7 +911,7 @@ public void OnStop(bool completed, long numberOfRecords)
[Category("CheckpointRestore")]
[Category("Smoke")]
- public async ValueTask StreamingSnapshotBasicTest([Values] CompletionSyncMode completionSyncMode, [Values] ReadCacheMode readCacheMode,
+ public async ValueTask StreamingSnapshotBasicTest([Values] ReadCacheMode readCacheMode,
[Values] bool reInsert, [Values(1L << 13, 1L << 16)] long indexSize)
{
using var store1 = new TsavoriteKV(new()
@@ -1049,10 +1009,7 @@ public async ValueTask StreamingSnapshotBasicTest([Values] CompletionSyncMode co
// Take a streaming snapshot checkpoint of the old store
var iterator = new SnapshotIterator(store2, 1000);
var task = store1.TakeFullCheckpointAsync(CheckpointType.StreamingSnapshot, streamingSnapshotIteratorFunctions: iterator);
- if (completionSyncMode == CompletionSyncMode.Async)
- _ = await task;
- else
- _ = task.AsTask().GetAwaiter().GetResult();
+ _ = await task.ConfigureAwait(false);
// Verify that the new store has all the records
using var s2 = store2.NewSession(new MyFunctions());
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs
index c60658a922b..378680bf65c 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/RecoveryTests.cs
@@ -81,7 +81,7 @@ private void PrepareToRecover(TestDeviceType deviceType)
[Test]
[Category("TsavoriteKV")]
[Category("CheckpointRestore")]
- public async ValueTask RecoveryTestSeparateCheckpoint([Values] CompletionSyncMode syncMode, [Values] TestDeviceType deviceType)
+ public async ValueTask RecoveryTestSeparateCheckpoint([Values] TestDeviceType deviceType)
{
Setup(deviceType);
Populate(SeparateCheckpointAction);
@@ -90,7 +90,7 @@ public async ValueTask RecoveryTestSeparateCheckpoint([Values] CompletionSyncMod
{
if (i >= indexTokens.Count) break;
PrepareToRecover(deviceType);
- await RecoverAndTest(i, syncMode == CompletionSyncMode.Async).ConfigureAwait(false);
+ await RecoverAndTest(i).ConfigureAwait(false);
}
}
@@ -98,7 +98,7 @@ public async ValueTask RecoveryTestSeparateCheckpoint([Values] CompletionSyncMod
[Category("TsavoriteKV")]
[Category("CheckpointRestore")]
[Category("Smoke")]
- public async ValueTask RecoveryTestFullCheckpoint([Values] CompletionSyncMode syncMode, [Values] TestDeviceType deviceType)
+ public async ValueTask RecoveryTestFullCheckpoint([Values] TestDeviceType deviceType)
{
Setup(deviceType);
Populate(FullCheckpointAction);
@@ -106,7 +106,7 @@ public async ValueTask RecoveryTestFullCheckpoint([Values] CompletionSyncMode sy
for (var i = 0; i < logTokens.Count; i++)
{
PrepareToRecover(deviceType);
- await RecoverAndTest(i, syncMode == CompletionSyncMode.Async).ConfigureAwait(false);
+ await RecoverAndTest(i).ConfigureAwait(false);
}
}
@@ -171,16 +171,13 @@ private void Populate(Action checkpointAction)
_ = bContext.CompletePending(true);
}
- private async ValueTask RecoverAndTest(int tokenIndex, bool isAsync)
+ private async ValueTask RecoverAndTest(int tokenIndex)
{
var logToken = logTokens[tokenIndex];
var indexToken = indexTokens[tokenIndex];
// Recover
- if (isAsync)
- _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false);
- else
- _ = store.Recover(indexToken, logToken);
+ _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false);
// Create array for reading
var inputArray = GC.AllocateArray((int)NumUniqueKeys, pinned: true);
@@ -293,32 +290,32 @@ private TsavoriteKV PrepareToRecover RunTest(allocatorType,
() => StoreFunctions.Create(new AdId.Comparer(), SpanByteRecordTriggers.Instance),
(allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions),
- Populate, Read, Recover, isAsync),
+ Populate, Read, Recover),
AllocatorType.Object => RunTest(allocatorType,
() => StoreFunctions.Create(new TestObjectKey.Comparer(), () => new TestObjectValue.Serializer(), DefaultRecordTriggers.Instance),
(allocatorSettings, storeFunctions) => new(allocatorSettings, storeFunctions),
- Populate, Read, Recover, isAsync),
+ Populate, Read, Recover),
_ => throw new ApplicationException("Unknown allocator type"),
};
;
@@ -329,8 +326,7 @@ private async ValueTask RunTest(AllocatorType alloc
Func storeFunctionsCreator, Func allocatorCreator,
Action> populateAction,
Action> readAction,
- Func, bool, ValueTask> recoverFunc,
- bool isAsync)
+ Func, ValueTask> recoverFunc)
where TStoreFunctions : IStoreFunctions
where TAllocator : IAllocator
{
@@ -339,18 +335,18 @@ private async ValueTask RunTest(AllocatorType alloc
readAction(store);
if (smallSector)
{
- _ = Assert.ThrowsAsync(async () => await Checkpoint(store, isAsync).ConfigureAwait(false));
+ _ = Assert.ThrowsAsync(async () => await Checkpoint(store).ConfigureAwait(false));
Assert.Pass("Verified expected exception on mismatched sector sizes; the test cannot continue, so exiting early with success");
}
else
- await Checkpoint(store, isAsync).ConfigureAwait(false);
+ await Checkpoint(store).ConfigureAwait(false);
ClassicAssert.AreNotEqual(Guid.Empty, logToken);
ClassicAssert.AreNotEqual(Guid.Empty, indexToken);
readAction(store);
store = PrepareToRecover(allocatorType, storeFunctionsCreator, allocatorCreator);
- await recoverFunc(store, isAsync).ConfigureAwait(false);
+ await recoverFunc(store).ConfigureAwait(false);
readAction(store);
}
@@ -399,27 +395,19 @@ private unsafe void Populate(TsavoriteKV st
_ = bContext.CompletePending(true);
}
- private async ValueTask Checkpoint(TsavoriteKV store, bool isAsync)
+ private async ValueTask Checkpoint(TsavoriteKV store)
where TStoreFunctions : IStoreFunctions
where TAllocator : IAllocator
{
- if (isAsync)
- {
- var (success, token) = await store.TakeFullCheckpointAsync(CheckpointType.Snapshot).ConfigureAwait(false);
- ClassicAssert.IsTrue(success);
- logToken = token;
- }
- else
- {
- while (!store.TryInitiateFullCheckpoint(out logToken, CheckpointType.Snapshot)) { }
- store.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult();
- }
+ var (success, token) = await store.TakeFullCheckpointAsync(CheckpointType.Snapshot).ConfigureAwait(false);
+ ClassicAssert.IsTrue(success);
+ logToken = token;
indexToken = logToken;
}
- private async ValueTask RecoverAndReadTest(TsavoriteKV store, bool isAsync)
+ private async ValueTask RecoverAndReadTest(TsavoriteKV store)
{
- await Recover(store, isAsync).ConfigureAwait(false);
+ await Recover(store).ConfigureAwait(false);
Read(store);
}
@@ -446,9 +434,9 @@ private static void Read(TsavoriteKV st
}
}
- private async ValueTask RecoverAndReadTest(TsavoriteKV store, bool isAsync)
+ private async ValueTask RecoverAndReadTest(TsavoriteKV store)
{
- await Recover(store, isAsync).ConfigureAwait(false);
+ await Recover(store).ConfigureAwait(false);
Read(store);
}
@@ -467,14 +455,11 @@ private static void Read(TsavoriteKV store)
}
}
- private async ValueTask Recover(TsavoriteKV store, bool isAsync = false)
+ private async ValueTask Recover(TsavoriteKV store)
where TStoreFunctions : IStoreFunctions
where TAllocator : IAllocator
{
- if (isAsync)
- _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false);
- else
- _ = store.Recover(indexToken, logToken);
+ _ = await store.RecoverAsync(indexToken, logToken).ConfigureAwait(false);
}
}
}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs b/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs
index 487e0e0b5a3..380b6537446 100644
--- a/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs
+++ b/libs/storage/Tsavorite/cs/test/test.recovery/SimpleRecoveryTest.cs
@@ -81,14 +81,14 @@ public void TearDown()
[Category("TsavoriteKV"), Category("CheckpointRestore")]
public async ValueTask PageBlobSimpleRecoveryTest(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode, [Values] bool testCommitCookie)
+ [Values] bool testCommitCookie)
{
IgnoreIfNotRunningAzureTests();
checkpointManager = new CheckpointManagerWithCookie(
testCommitCookie,
TestUtils.AzureStorageNamedDeviceFactoryCreator,
new AzureCheckpointNamingScheme($"{AzureTestContainer}/{AzureTestDirectory}"));
- await SimpleRecoveryTest1_Worker(checkpointType, completionSyncMode, testCommitCookie).ConfigureAwait(false);
+ await SimpleRecoveryTest1_Worker(checkpointType, testCommitCookie).ConfigureAwait(false);
checkpointManager.PurgeAll();
}
@@ -99,18 +99,17 @@ public async ValueTask PageBlobSimpleRecoveryTest(
public async ValueTask LocalDeviceSimpleRecoveryTest(
[Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode,
[Values] bool testCommitCookie)
{
checkpointManager = new CheckpointManagerWithCookie(
testCommitCookie,
new LocalStorageNamedDeviceFactoryCreator(),
new DefaultCheckpointNamingScheme(Path.Join(MethodTestDir, "chkpt")));
- await SimpleRecoveryTest1_Worker(checkpointType, completionSyncMode, testCommitCookie).ConfigureAwait(false);
+ await SimpleRecoveryTest1_Worker(checkpointType, testCommitCookie).ConfigureAwait(false);
checkpointManager.PurgeAll();
}
- private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType, CompletionSyncMode completionSyncMode, bool testCommitCookie)
+ private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType, bool testCommitCookie)
{
log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleRecoveryTest1.log"), deleteOnClose: true);
@@ -152,16 +151,10 @@ private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType
}
_ = store1.TryInitiateFullCheckpoint(out Guid token, checkpointType);
- if (completionSyncMode == CompletionSyncMode.Sync)
- store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult();
- else
- await store1.CompleteCheckpointAsync().ConfigureAwait(false);
+ await store1.CompleteCheckpointAsync().ConfigureAwait(false);
session1.Dispose();
- if (completionSyncMode == CompletionSyncMode.Sync)
- _ = store2.Recover(token);
- else
- _ = await store2.RecoverAsync(token).ConfigureAwait(false);
+ _ = await store2.RecoverAsync(token).ConfigureAwait(false);
if (testCommitCookie)
ClassicAssert.IsTrue(store2.RecoveredCommitCookie.SequenceEqual(checkpointManager.Cookie));
@@ -194,8 +187,7 @@ private async ValueTask SimpleRecoveryTest1_Worker(CheckpointType checkpointType
[Test]
[Category("TsavoriteKV"), Category("CheckpointRestore")]
public async ValueTask SimpleRecoveryTest2(
- [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType,
- [Values] CompletionSyncMode completionSyncMode)
+ [Values(CheckpointType.Snapshot, CheckpointType.FoldOver)] CheckpointType checkpointType)
{
checkpointManager = new CheckpointManagerWithCookie(false, new LocalStorageNamedDeviceFactoryCreator(), new DefaultCheckpointNamingScheme(Path.Join(MethodTestDir, "checkpoints4")), false);
log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleRecoveryTest2.log"), deleteOnClose: true);
@@ -235,13 +227,10 @@ public async ValueTask SimpleRecoveryTest2(
_ = bContext1.Upsert(inputArray[key], SpanByte.FromPinnedVariable(ref value), Empty.Default);
}
_ = store1.TryInitiateFullCheckpoint(out Guid token, checkpointType);
- store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult();
+ await store1.CompleteCheckpointAsync().ConfigureAwait(false);
session1.Dispose();
- if (completionSyncMode == CompletionSyncMode.Sync)
- _ = store2.Recover(token);
- else
- _ = await store2.RecoverAsync(token).ConfigureAwait(false);
+ _ = await store2.RecoverAsync(token).ConfigureAwait(false);
var session2 = store2.NewSession(new AdSimpleFunctions());
var bContext2 = session1.BasicContext;
@@ -260,7 +249,7 @@ public async ValueTask SimpleRecoveryTest2(
[Test]
[Category("TsavoriteKV"), Category("CheckpointRestore")]
- public async ValueTask ShouldRecoverBeginAddress([Values] CompletionSyncMode completionSyncMode)
+ public async ValueTask ShouldRecoverBeginAddress()
{
log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleRecoveryTest2.log"), deleteOnClose: true);
checkpointDir = Path.Join(MethodTestDir, "checkpoints6");
@@ -305,23 +294,17 @@ public async ValueTask ShouldRecoverBeginAddress([Values] CompletionSyncMode com
store1.Log.ShiftBeginAddress(address);
_ = store1.TryInitiateFullCheckpoint(out Guid token, CheckpointType.FoldOver);
- if (completionSyncMode == CompletionSyncMode.Sync)
- store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult();
- else
- await store1.CompleteCheckpointAsync().ConfigureAwait(false);
+ await store1.CompleteCheckpointAsync().ConfigureAwait(false);
session1.Dispose();
- if (completionSyncMode == CompletionSyncMode.Sync)
- _ = store2.Recover(token);
- else
- _ = await store2.RecoverAsync(token).ConfigureAwait(false);
+ _ = await store2.RecoverAsync(token).ConfigureAwait(false);
ClassicAssert.AreEqual(address, store2.Log.BeginAddress);
}
[Test]
[Category("TsavoriteKV"), Category("CheckpointRestore")]
- public async ValueTask SimpleReadAndUpdateInfoTest([Values] CompletionSyncMode completionSyncMode)
+ public async ValueTask SimpleReadAndUpdateInfoTest()
{
checkpointManager = new CheckpointManagerWithCookie(false, new LocalStorageNamedDeviceFactoryCreator(), new DefaultCheckpointNamingScheme(Path.Join(MethodTestDir, "checkpoints")), false);
log = Devices.CreateLogDevice(Path.Join(MethodTestDir, "SimpleReadAndUpdateInfoTest.log"), deleteOnClose: true);
@@ -369,16 +352,10 @@ public async ValueTask SimpleReadAndUpdateInfoTest([Values] CompletionSyncMode c
}
}
_ = store1.TryInitiateFullCheckpoint(out Guid token, CheckpointType.FoldOver);
- if (completionSyncMode == CompletionSyncMode.Sync)
- store1.CompleteCheckpointAsync().AsTask().GetAwaiter().GetResult();
- else
- await store1.CompleteCheckpointAsync().ConfigureAwait(false);
+ await store1.CompleteCheckpointAsync().ConfigureAwait(false);
session1.Dispose();
- if (completionSyncMode == CompletionSyncMode.Sync)
- _ = store2.Recover(token);
- else
- _ = await store2.RecoverAsync(token).ConfigureAwait(false);
+ _ = await store2.RecoverAsync(token).ConfigureAwait(false);
var session2 = store2.NewSession(functions2);
var bContext2 = session2.BasicContext;
diff --git a/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs b/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs
index 294fe36bb92..9923a46aa38 100644
--- a/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.session.context/TransactionalUnsafeContextTests.cs
@@ -358,7 +358,7 @@ public void ManualLockCollidingHashCodes([Values] UseSingleBucketComparer /* jus
[Test]
[Category("TsavoriteKV")]
[Category("Smoke")]
- public async Task TestShiftHeadAddressLUC([Values] CompletionSyncMode syncMode)
+ public async Task TestShiftHeadAddressLUC()
{
long input = 0;
const int RandSeed = 10;
@@ -412,16 +412,9 @@ public async Task TestShiftHeadAddressLUC([Values] CompletionSyncMode syncMode)
AssertTotalLockCounts(0, 0);
- if (syncMode == CompletionSyncMode.Sync)
- {
- _ = luContext.CompletePending(true);
- }
- else
- {
- luContext.EndUnsafe();
- await luContext.CompletePendingAsync().ConfigureAwait(false);
- luContext.BeginUnsafe();
- }
+ luContext.EndUnsafe();
+ await luContext.CompletePendingAsync().ConfigureAwait(false);
+ luContext.BeginUnsafe();
// Shift head and retry - should not find in main memory now
store.Log.FlushAndEvict(true);
@@ -458,17 +451,9 @@ public async Task TestShiftHeadAddressLUC([Values] CompletionSyncMode syncMode)
// We did not lock all keys, only the "Action" ones - one lock per bucket, all shared in this test
AssertTotalLockCounts(0, expectedS);
- CompletedOutputIterator outputs;
- if (syncMode == CompletionSyncMode.Sync)
- {
- _ = luContext.CompletePendingWithOutputs(out outputs, wait: true);
- }
- else
- {
- luContext.EndUnsafe();
- outputs = await luContext.CompletePendingWithOutputsAsync().ConfigureAwait(false);
- luContext.BeginUnsafe();
- }
+ luContext.EndUnsafe();
+ var outputs = await luContext.CompletePendingWithOutputsAsync().ConfigureAwait(false);
+ luContext.BeginUnsafe();
foreach (var idx in EnumActionKeyIndices(lockKeys, LockOperationType.Unlock))
{
diff --git a/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs b/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs
index 14b8e9f7812..061161ed337 100644
--- a/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs
+++ b/libs/storage/Tsavorite/cs/test/test.session.context/UnsafeContextTests.cs
@@ -267,7 +267,7 @@ public unsafe void NativeInMemWriteRead2()
[Test]
[Category("TsavoriteKV")]
[Category("Smoke")]
- public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType, [Values] CompletionSyncMode syncMode)
+ public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType)
{
InputStruct input = default;
const int RandSeed = 10;
@@ -307,16 +307,9 @@ public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType, [Va
ClassicAssert.AreEqual(value.vfield2, output.value.vfield2);
}
}
- if (syncMode == CompletionSyncMode.Sync)
- {
- _ = uContext.CompletePending(true);
- }
- else
- {
- uContext.EndUnsafe();
- await uContext.CompletePendingAsync().ConfigureAwait(false);
- uContext.BeginUnsafe();
- }
+ uContext.EndUnsafe();
+ await uContext.CompletePendingAsync().ConfigureAwait(false);
+ uContext.BeginUnsafe();
// Shift head and retry - should not find in main memory now
store.Log.FlushAndEvict(true);
@@ -333,17 +326,9 @@ public async Task TestShiftHeadAddressUC([Values] TestDeviceType deviceType, [Va
ClassicAssert.IsTrue(foundStatus.IsPending);
}
- CompletedOutputIterator outputs;
- if (syncMode == CompletionSyncMode.Sync)
- {
- _ = uContext.CompletePendingWithOutputs(out outputs, wait: true);
- }
- else
- {
- uContext.EndUnsafe();
- outputs = await uContext.CompletePendingWithOutputsAsync().ConfigureAwait(false);
- uContext.BeginUnsafe();
- }
+ uContext.EndUnsafe();
+ var outputs = await uContext.CompletePendingWithOutputsAsync().ConfigureAwait(false);
+ uContext.BeginUnsafe();
int count = 0;
while (outputs.Next())
diff --git a/test/standalone/Garnet.test.collections/GarnetObjectTests.cs b/test/standalone/Garnet.test.collections/GarnetObjectTests.cs
index 1acdc34b459..224d6cd4a98 100644
--- a/test/standalone/Garnet.test.collections/GarnetObjectTests.cs
+++ b/test/standalone/Garnet.test.collections/GarnetObjectTests.cs
@@ -63,7 +63,7 @@ public async Task WriteCheckpointRead()
_ = await store.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver).ConfigureAwait(false);
store.Dispose();
CreateStore();
- _ = store.Recover();
+ _ = await store.RecoverAsync().ConfigureAwait(false);
LocalRead();
void LocalWrite()
@@ -100,7 +100,7 @@ public async Task WriteCheckpointCopyUpdate()
_ = await store.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver).ConfigureAwait(false);
store.Dispose();
CreateStore();
- _ = store.Recover();
+ _ = await store.RecoverAsync().ConfigureAwait(false);
LocalRead();
void LocalWrite()
diff --git a/test/standalone/Garnet.test/RespConfigTests.cs b/test/standalone/Garnet.test/RespConfigTests.cs
index 2a03faec1c2..06bb5221cc6 100644
--- a/test/standalone/Garnet.test/RespConfigTests.cs
+++ b/test/standalone/Garnet.test/RespConfigTests.cs
@@ -713,8 +713,8 @@ public void ConfigSetHeapMemorySizeUtilizationTest(int smallerSize)
// Sanity-check the preconditions for the shrink/eviction we are about to trigger.
var apcBefore = store.Log.AllocatedPageCount;
var heapBefore = tracker.LogHeapSizeBytes;
- Assert.That(apcBefore, Is.GreaterThan(LogSizeTracker.MinResizeTargetPageCount),
- "Test precondition: need more than MinResizeTargetPageCount pages for eviction to be possible.");
+ Assert.That(apcBefore, Is.GreaterThan(1),
+ "Test precondition: need more than one page for eviction to be possible.");
Assert.That(heapBefore, Is.GreaterThan(0), "Test precondition: heap should be non-empty after inserts.");
using var trimCompleteEvent = new ManualResetEventSlim(false);