Add backoff-retry to elevation for transient admin 403 and grant lag

Logs showed the failure was a transient 403 on the tenant admin endpoint
(loading CurrentUser on -admin.sharepoint.com returned E_ACCESSDENIED on a
cold token), and that re-running the operation a few seconds later succeeded.
The site-collection admin grant is also eventually consistent on Group/Teams
sites, taking a few seconds to reach the content endpoint.

Retry both stages with backoff (3s, 6s, 9s; 4 attempts) instead of failing
on the first denial:
- ElevateAsync retries the admin-endpoint grant on transient access-denied; a
  genuine lack of tenant-admin rights still surfaces after retries exhaust.
- After a successful grant, the post-elevation operation retries on continued
  access-denied to absorb grant-propagation lag.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 14:39:29 +02:00
parent e4125c6643
commit 881f3a8bac
+50 -15
View File
@@ -12,6 +12,10 @@ namespace SharepointToolbox.Web.Services;
/// Retry is safe because the wrapped operation closure re-issues its own CSOM loads on each
/// attempt; the granted permission is server-side and takes effect for the existing delegated
/// token without re-authentication. Each site is elevated at most once per circuit to prevent loops.
///
/// Both the admin-endpoint grant and the post-grant operation are retried with backoff: the
/// tenant admin endpoint can transiently 403 on a cold token, and the site-collection admin grant
/// is eventually consistent (notably on Group/Teams-connected sites), taking a few seconds to apply.
/// </summary>
public class ElevationCoordinator : IElevationCoordinator
{
@@ -62,11 +66,28 @@ public class ElevationCoordinator : IElevationCoordinator
// so the logs distinguish "grant failed/no-op" from "scan still fails for another reason".
await VerifyAdminAsync(siteUrl, ct);
// Re-run once. The closure re-issues its loads; the now-granted admin right applies.
return await operation(ct);
// The site-collection admin grant is eventually consistent — on Group/Teams sites it
// can take a few seconds to propagate to the content endpoint. Retry with backoff.
for (int attempt = 1; ; attempt++)
{
try
{
return await operation(ct);
}
catch (SharePointAccessDeniedException) when (attempt < MaxBackoffAttempts)
{
var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt);
Log.Warning("Post-elevation scan still denied for {Site} (attempt {N}/{Max}); retrying in {Delay}s.",
siteUrl, attempt, MaxBackoffAttempts, delay.TotalSeconds);
await Task.Delay(delay, ct);
}
}
}
}
private const int MaxBackoffAttempts = 4;
private const int BackoffBaseSeconds = 3;
private async Task ElevateAsync(string siteUrl, CancellationToken ct)
{
var profile = _session.CurrentProfile
@@ -82,20 +103,34 @@ public class ElevationCoordinator : IElevationCoordinator
ClientLogo = profile.ClientLogo,
};
try
var adminCtx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct);
Log.Information("Auto-elevating site-collection admin ownership for {Site} via {Admin}",
siteUrl, adminProfile.TenantUrl);
for (int attempt = 1; ; attempt++)
{
var adminCtx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct);
Log.Information("Auto-elevating site-collection admin ownership for {Site} via {Admin}",
siteUrl, adminProfile.TenantUrl);
// loginName empty → ElevateAsync resolves the current (delegated) user from the admin context.
await _ownership.ElevateAsync(adminCtx, siteUrl, loginName: string.Empty, ct);
}
catch (Exception ex) when (ex is not OperationCanceledException)
{
Log.Error(ex, "Auto-elevate ownership failed for {Site}", siteUrl);
throw new InvalidOperationException(
$"Auto-elevate ownership failed for {siteUrl}. Granting site-collection admin requires " +
$"SharePoint tenant administrator rights on the signed-in account. ({ex.Message})", ex);
try
{
// loginName empty → ElevateAsync resolves the current (delegated) user from the admin context.
await _ownership.ElevateAsync(adminCtx, siteUrl, loginName: string.Empty, ct);
return;
}
// The admin endpoint can transiently 403 on a cold token / first call; it clears within
// seconds. A genuine lack of tenant-admin rights keeps failing and surfaces after retries.
catch (SharePointAccessDeniedException ex) when (attempt < MaxBackoffAttempts)
{
var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt);
Log.Warning("Admin endpoint denied for {Site} (attempt {N}/{Max}); retrying in {Delay}s. {Err}",
siteUrl, attempt, MaxBackoffAttempts, delay.TotalSeconds, ex.Message);
await Task.Delay(delay, ct);
}
catch (Exception ex) when (ex is not OperationCanceledException)
{
Log.Error(ex, "Auto-elevate ownership failed for {Site}", siteUrl);
throw new InvalidOperationException(
$"Auto-elevate ownership failed for {siteUrl}. Granting site-collection admin requires " +
$"SharePoint tenant administrator rights on the signed-in account. ({ex.Message})", ex);
}
}
}