Add backoff-retry to elevation for transient admin 403 and grant lag

Logs showed that the failure was a transient 403 on the tenant admin endpoint
(loading CurrentUser on <tenant>-admin.sharepoint.com returned E_ACCESSDENIED
on a cold token) and that re-running the operation a few seconds later
succeeded.
The site-collection admin grant is also eventually consistent on Group/Teams
sites, taking a few seconds to reach the content endpoint.

Retry both stages with linear backoff (up to 4 attempts, waiting 3s, 6s, then
9s between them) instead of failing on the first denial:
- ElevateAsync retries the admin-endpoint grant on a transient access-denied;
  a genuine lack of tenant-admin rights still surfaces once the retries are
  exhausted.
- After a successful grant, the post-elevation operation is retried while it
  is still denied access, to absorb grant-propagation lag.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 14:39:29 +02:00
parent e4125c6643
commit 881f3a8bac
+38 -3
View File
@@ -12,6 +12,10 @@ namespace SharepointToolbox.Web.Services;
/// Retry is safe because the wrapped operation closure re-issues its own CSOM loads on each /// Retry is safe because the wrapped operation closure re-issues its own CSOM loads on each
/// attempt; the granted permission is server-side and takes effect for the existing delegated /// attempt; the granted permission is server-side and takes effect for the existing delegated
/// token without re-authentication. Each site is elevated at most once per circuit to prevent loops. /// token without re-authentication. Each site is elevated at most once per circuit to prevent loops.
///
/// Both the admin-endpoint grant and the post-grant operation are retried with backoff: the
/// tenant admin endpoint can transiently 403 on a cold token, and the site-collection admin grant
/// is eventually consistent (notably on Group/Teams-connected sites), taking a few seconds to apply.
/// </summary> /// </summary>
public class ElevationCoordinator : IElevationCoordinator public class ElevationCoordinator : IElevationCoordinator
{ {
@@ -62,10 +66,27 @@ public class ElevationCoordinator : IElevationCoordinator
// so the logs distinguish "grant failed/no-op" from "scan still fails for another reason". // so the logs distinguish "grant failed/no-op" from "scan still fails for another reason".
await VerifyAdminAsync(siteUrl, ct); await VerifyAdminAsync(siteUrl, ct);
// Re-run once. The closure re-issues its loads; the now-granted admin right applies. // The site-collection admin grant is eventually consistent — on Group/Teams sites it
// can take a few seconds to propagate to the content endpoint. Retry with backoff.
for (int attempt = 1; ; attempt++)
{
try
{
return await operation(ct); return await operation(ct);
} }
catch (SharePointAccessDeniedException) when (attempt < MaxBackoffAttempts)
{
var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt);
Log.Warning("Post-elevation scan still denied for {Site} (attempt {N}/{Max}); retrying in {Delay}s.",
siteUrl, attempt, MaxBackoffAttempts, delay.TotalSeconds);
await Task.Delay(delay, ct);
} }
}
}
}
private const int MaxBackoffAttempts = 4;
private const int BackoffBaseSeconds = 3;
private async Task ElevateAsync(string siteUrl, CancellationToken ct) private async Task ElevateAsync(string siteUrl, CancellationToken ct)
{ {
@@ -82,13 +103,26 @@ public class ElevationCoordinator : IElevationCoordinator
ClientLogo = profile.ClientLogo, ClientLogo = profile.ClientLogo,
}; };
try
{
var adminCtx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct); var adminCtx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct);
Log.Information("Auto-elevating site-collection admin ownership for {Site} via {Admin}", Log.Information("Auto-elevating site-collection admin ownership for {Site} via {Admin}",
siteUrl, adminProfile.TenantUrl); siteUrl, adminProfile.TenantUrl);
for (int attempt = 1; ; attempt++)
{
try
{
// loginName empty → ElevateAsync resolves the current (delegated) user from the admin context. // loginName empty → ElevateAsync resolves the current (delegated) user from the admin context.
await _ownership.ElevateAsync(adminCtx, siteUrl, loginName: string.Empty, ct); await _ownership.ElevateAsync(adminCtx, siteUrl, loginName: string.Empty, ct);
return;
}
// The admin endpoint can transiently 403 on a cold token / first call; it clears within
// seconds. A genuine lack of tenant-admin rights keeps failing and surfaces after retries.
catch (SharePointAccessDeniedException ex) when (attempt < MaxBackoffAttempts)
{
var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt);
Log.Warning("Admin endpoint denied for {Site} (attempt {N}/{Max}); retrying in {Delay}s. {Err}",
siteUrl, attempt, MaxBackoffAttempts, delay.TotalSeconds, ex.Message);
await Task.Delay(delay, ct);
} }
catch (Exception ex) when (ex is not OperationCanceledException) catch (Exception ex) when (ex is not OperationCanceledException)
{ {
@@ -98,6 +132,7 @@ public class ElevationCoordinator : IElevationCoordinator
$"SharePoint tenant administrator rights on the signed-in account. ({ex.Message})", ex); $"SharePoint tenant administrator rights on the signed-in account. ({ex.Message})", ex);
} }
} }
}
// Reads the current user's site-admin flag on the target site right after elevation. // Reads the current user's site-admin flag on the target site right after elevation.
// Diagnostic only — never throws into the operation flow. // Diagnostic only — never throws into the operation flow.