Fixed site discovery process that did not return all the sites

This commit is contained in:
2026-06-04 11:32:34 +02:00
parent 95aca34b31
commit ce2f0512cb
10 changed files with 315 additions and 58 deletions
+103 -42
View File
@@ -1,23 +1,31 @@
using Microsoft.Graph;
using Microsoft.Graph.Models;
using Microsoft.Kiota.Abstractions;
using Microsoft.Online.SharePoint.TenantAdministration;
using Microsoft.SharePoint.Client;
using SharepointToolbox.Web.Core.Helpers;
using SharepointToolbox.Web.Core.Models;
using AppGraphClientFactory = SharepointToolbox.Web.Infrastructure.Auth.GraphClientFactory;
using SharepointToolbox.Web.Services.Session;
namespace SharepointToolbox.Web.Services;
/// <summary>
/// Delegated Graph implementation of <see cref="ISiteDiscoveryService"/>.
/// Uses the <c>/sites?search=*</c> endpoint, paging through every result.
/// Requires the delegated <c>Sites.Read.All</c> scope.
/// Delegated CSOM implementation of <see cref="ISiteDiscoveryService"/>.
///
/// Enumerates every site collection via the SharePoint tenant admin endpoint
/// (<c>Tenant.GetSitePropertiesFromSharePointByFilters</c>), paging through all
/// results. Requires the signed-in user to be a SharePoint administrator.
///
/// The Graph <c>/sites?search=*</c> endpoint was deliberately abandoned: it ranks
/// by relevance and is capped server-side, so it silently dropped sites and
/// returned varying counts run-to-run. <c>/sites/getAllSites</c> is app-only and
/// 403s on a delegated user token. The tenant admin enumeration is the only path
/// that returns the complete, stable set under the app's delegated auth model.
/// </summary>
public class SiteDiscoveryService : ISiteDiscoveryService
{
private readonly AppGraphClientFactory _graphClientFactory;
private readonly ISessionManager _sessionManager;
public SiteDiscoveryService(AppGraphClientFactory graphClientFactory)
public SiteDiscoveryService(ISessionManager sessionManager)
{
_graphClientFactory = graphClientFactory;
_sessionManager = sessionManager;
}
public async Task<IReadOnlyList<SiteInfo>> SearchSitesAsync(
@@ -25,47 +33,52 @@ public class SiteDiscoveryService : ISiteDiscoveryService
string? query = null,
CancellationToken ct = default)
{
var graphClient = await _graphClientFactory.CreateClientAsync(profile);
// "*" is the Graph convention for "return all sites".
var search = string.IsNullOrWhiteSpace(query) ? "*" : query!;
ArgumentException.ThrowIfNullOrEmpty(profile.TenantUrl);
// The typed Sites.GetAsync maps its Search property to OData "$search",
// which routes "*" through KQL and fails ("'*' is not valid at position 0").
// The all-sites wildcard only works via the bare, non-OData "search"
// query parameter, so build the request manually.
var requestInfo = new RequestInformation
// Site enumeration only exists on the tenant admin endpoint.
var adminProfile = new TenantProfile
{
HttpMethod = Method.GET,
UrlTemplate = "{+baseurl}/sites{?search,%24top}",
PathParameters = new Dictionary<string, object>
{
{ "baseurl", graphClient.RequestAdapter.BaseUrl ?? "https://graph.microsoft.com/v1.0" }
},
Id = profile.Id,
Name = profile.Name,
TenantUrl = BuildAdminUrl(profile.TenantUrl),
TenantId = profile.TenantId,
ClientId = profile.ClientId,
ClientLogo = profile.ClientLogo,
};
requestInfo.QueryParameters.Add("search", search);
requestInfo.QueryParameters.Add("%24top", 999);
requestInfo.Headers.Add("Accept", "application/json");
var response = await graphClient.RequestAdapter.SendAsync<SiteCollectionResponse>(
requestInfo, SiteCollectionResponse.CreateFromDiscriminatorValue, cancellationToken: ct);
var ctx = await _sessionManager.GetOrCreateContextAsync(adminProfile, ct);
var tenant = new Tenant(ctx);
if (response is null) return Array.Empty<SiteInfo>();
var filter = new SPOSitePropertiesEnumerableFilter
{
IncludeDetail = false,
IncludePersonalSite = PersonalSiteFilter.Exclude,
StartIndex = null,
Template = null,
};
var results = new List<SiteInfo>();
var iter = PageIterator<Site, SiteCollectionResponse>.CreatePageIterator(
graphClient, response,
site =>
SPOSitePropertiesEnumerable page;
do
{
ct.ThrowIfCancellationRequested();
page = await FetchPageWithColdTokenRetryAsync(ctx, tenant, filter, ct);
foreach (var sp in page)
{
if (ct.IsCancellationRequested) return false;
var url = site.WebUrl ?? string.Empty;
if (string.IsNullOrEmpty(url)) return true;
// Skip OneDrive personal sites — not useful for these scans.
if (url.Contains("/personal/", StringComparison.OrdinalIgnoreCase)) return true;
var title = site.DisplayName ?? site.Name ?? url;
var url = sp.Url ?? string.Empty;
if (string.IsNullOrEmpty(url)) continue;
// Belt-and-braces: PersonalSiteFilter.Exclude already drops OneDrive.
if (url.Contains("/personal/", StringComparison.OrdinalIgnoreCase)) continue;
var title = string.IsNullOrEmpty(sp.Title) ? url : sp.Title;
results.Add(new SiteInfo(url, title));
return true;
});
await iter.IterateAsync(ct);
}
// NextStartIndexFromSharePoint is empty/null once the last page is returned.
filter.StartIndex = page.NextStartIndexFromSharePoint;
}
while (!string.IsNullOrEmpty(filter.StartIndex));
return results
.GroupBy(s => s.Url, StringComparer.OrdinalIgnoreCase)
@@ -73,4 +86,52 @@ public class SiteDiscoveryService : ISiteDiscoveryService
.OrderBy(s => s.Title, StringComparer.OrdinalIgnoreCase)
.ToList();
}
private const int MaxColdTokenAttempts = 4;
private const int BackoffBaseSeconds = 3;
// The tenant admin endpoint transiently 403s on a cold delegated token (the same
// behaviour the elevation coordinator handles): the first call against the admin
// host can be denied while the token warms, then clears within seconds. Retry the
// admin query on access-denied with backoff. A genuine lack of SharePoint tenant
// administrator rights keeps failing and surfaces the enriched 403 after retries —
// elevation cannot self-grant tenant-admin, so there is nothing to auto-correct.
//
// The request (GetSiteProperties + Load) MUST be re-issued inside the loop: a failed
// CSOM ExecuteQuery clears the context's pending-request queue, so retrying the
// execute alone would run an empty batch, leave the page uninitialized, and throw
// "The collection has not been initialized" on iteration.
private static async Task<SPOSitePropertiesEnumerable> FetchPageWithColdTokenRetryAsync(
ClientContext ctx, Tenant tenant, SPOSitePropertiesEnumerableFilter filter, CancellationToken ct)
{
for (int attempt = 1; ; attempt++)
{
try
{
var page = tenant.GetSitePropertiesFromSharePointByFilters(filter);
ctx.Load(page);
await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, null, ct);
return page;
}
catch (SharePointAccessDeniedException ex) when (attempt < MaxColdTokenAttempts)
{
var delay = TimeSpan.FromSeconds(BackoffBaseSeconds * attempt);
Serilog.Log.Warning(
"Tenant admin endpoint denied during site discovery (attempt {N}/{Max}); " +
"retrying in {Delay}s. {Err}",
attempt, MaxColdTokenAttempts, delay.TotalSeconds, ex.Message);
await Task.Delay(delay, ct);
}
}
}
// https://contoso.sharepoint.com[/sites/Foo] → https://contoso-admin.sharepoint.com
private static string BuildAdminUrl(string tenantUrl)
{
if (!Uri.TryCreate(tenantUrl, UriKind.Absolute, out var uri))
return tenantUrl;
var adminHost = uri.Host.Replace(".sharepoint.com", "-admin.sharepoint.com",
StringComparison.OrdinalIgnoreCase);
return $"{uri.Scheme}://{adminHost}";
}
}