From 881f3a8bac74b73a10553eee6616936a9bea696f Mon Sep 17 00:00:00 2001 From: kawa Date: Tue, 2 Jun 2026 14:39:29 +0200 Subject: [PATCH] Add backoff-retry to elevation for transient admin 403 and grant lag Logs showed the failure was a transient 403 on the tenant admin endpoint (loading CurrentUser on -admin.sharepoint.com returned E_ACCESSDENIED on a cold token), and that re-running the operation a few seconds later succeeded. The site-collection admin grant is also eventually consistent on Group/Teams sites, taking a few seconds to reach the content endpoint. Retry both stages with backoff (3s, 6s, 9s; 4 attempts) instead of failing on the first denial: - ElevateAsync retries the admin-endpoint grant on transient access-denied; a genuine lack of tenant-admin rights still surfaces after retries exhaust. - After a successful grant, the post-elevation operation retries on continued access-denied to absorb grant-propagation lag. Co-Authored-By: Claude Opus 4.8 (1M context) --- Services/ElevationCoordinator.cs | 65 ++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/Services/ElevationCoordinator.cs b/Services/ElevationCoordinator.cs index 3224541..71740ec 100644 --- a/Services/ElevationCoordinator.cs +++ b/Services/ElevationCoordinator.cs @@ -12,6 +12,10 @@ namespace SharepointToolbox.Web.Services; /// Retry is safe because the wrapped operation closure re-issues its own CSOM loads on each /// attempt; the granted permission is server-side and takes effect for the existing delegated /// token without re-authentication. Each site is elevated at most once per circuit to prevent loops. +/// +/// Both the admin-endpoint grant and the post-grant operation are retried with backoff: the +/// tenant admin endpoint can transiently 403 on a cold token, and the site-collection admin grant +/// is eventually consistent (notably on Group/Teams-connected sites), taking a few seconds to apply. /// public class ElevationCoordinator : IElevationCoordinator { @@ -62,11 +66,28 @@ public class ElevationCoordinator : IElevationCoordinator // so the logs distinguish "grant failed/no-op" from "scan still fails for another reason". await VerifyAdminAsync(siteUrl, ct); - // Re-run once. The closure re-issues its loads; the now-granted admin right applies. - return await operation(ct); + // The site-collection admin grant is eventually consistent — on Group/Teams sites it + // can take a few seconds to propagate to the content endpoint. Retry with backoff. + for (int attempt = 1; ; attempt++) + { + try + { + return await operation(ct); + } + catch (SharePointAccessDeniedException) when (attempt < MaxBackoffAttempts) + { + var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt); + Log.Warning("Post-elevation scan still denied for {Site} (attempt {N}/{Max}); retrying in {Delay}s.", + siteUrl, attempt, MaxBackoffAttempts, delay.TotalSeconds); + await Task.Delay(delay, ct); + } + } } } + private const int MaxBackoffAttempts = 4; + private const int BackoffBaseSeconds = 3; + private async Task ElevateAsync(string siteUrl, CancellationToken ct) { var profile = _session.CurrentProfile @@ -82,20 +103,34 @@ public class ElevationCoordinator : IElevationCoordinator ClientLogo = profile.ClientLogo, }; - try + var adminCtx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct); + Log.Information("Auto-elevating site-collection admin ownership for {Site} via {Admin}", + siteUrl, adminProfile.TenantUrl); + + for (int attempt = 1; ; attempt++) { - var adminCtx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct); - Log.Information("Auto-elevating site-collection admin ownership for {Site} via {Admin}", - siteUrl, adminProfile.TenantUrl); - // loginName empty → ElevateAsync resolves the current (delegated) user from the admin context. - await _ownership.ElevateAsync(adminCtx, siteUrl, loginName: string.Empty, ct); - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - Log.Error(ex, "Auto-elevate ownership failed for {Site}", siteUrl); - throw new InvalidOperationException( - $"Auto-elevate ownership failed for {siteUrl}. Granting site-collection admin requires " + - $"SharePoint tenant administrator rights on the signed-in account. ({ex.Message})", ex); + try + { + // loginName empty → ElevateAsync resolves the current (delegated) user from the admin context. + await _ownership.ElevateAsync(adminCtx, siteUrl, loginName: string.Empty, ct); + return; + } + // The admin endpoint can transiently 403 on a cold token / first call; it clears within + // seconds. A genuine lack of tenant-admin rights keeps failing and surfaces after retries. + catch (SharePointAccessDeniedException ex) when (attempt < MaxBackoffAttempts) + { + var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt); + Log.Warning("Admin endpoint denied for {Site} (attempt {N}/{Max}); retrying in {Delay}s. {Err}", + siteUrl, attempt, MaxBackoffAttempts, delay.TotalSeconds, ex.Message); + await Task.Delay(delay, ct); + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + Log.Error(ex, "Auto-elevate ownership failed for {Site}", siteUrl); + throw new InvalidOperationException( + $"Auto-elevate ownership failed for {siteUrl}. Granting site-collection admin requires " + + $"SharePoint tenant administrator rights on the signed-in account. ({ex.Message})", ex); + } } }