Files
Sharepoint-Toolbox/SharepointToolbox/Services/DuplicatesService.cs
T
Dev 12dd1de9f2 chore: release v2.4
- Add theme system (Dark/Light palettes, ModernTheme, ThemeManager)
- Add InputDialog, Spinner common view
- Add DuplicatesCsvExportService
- Refresh views, dialogs, and view models across tabs
- Update localization strings (en/fr)
- Tweak services (transfer, permissions, search, user access, ownership elevation, bulk operations)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 10:50:03 +02:00

340 lines
14 KiB
C#

using System.Diagnostics;
using Microsoft.SharePoint.Client;
using Microsoft.SharePoint.Client.Search.Query;
using SharepointToolbox.Core.Helpers;
using SharepointToolbox.Core.Models;
using SharepointToolbox.Services.Export;
namespace SharepointToolbox.Services;
/// <summary>
/// Duplicate file and folder detection.
/// Files: Search API (same KQL engine as SearchService) + client-side composite key grouping.
/// Folders: paged CSOM CAML query without a WHERE clause via SharePointPaginationHelper, folders filtered client-side, + composite key grouping.
/// Port of PS Find-DuplicateFiles / Find-DuplicateFolders (PS lines 4942-5036).
/// </summary>
public class DuplicatesService : IDuplicatesService
{
// SharePoint Search REST API caps RowLimit at 500 per request; larger values are silently clamped.
// Used as both the per-request RowLimit and the StartRow increment when paging file results.
private const int BatchSize = 500;
// SharePoint Search hard ceiling — StartRow > 50,000 returns an error regardless of pagination state.
// See https://learn.microsoft.com/sharepoint/dev/general-development/customizing-search-results-in-sharepoint
// File-mode paging stops once StartRow would pass this value; it is also the total reported to progress.
private const int MaxStartRow = 50_000;
/// <summary>
/// Finds duplicate files or folders on one site. Every collected item is
/// reduced to a composite key (name plus whichever of size / created /
/// modified / subfolder-count / file-count are enabled in
/// <paramref name="options"/>) and items sharing a key form a group.
/// File mode goes through the SharePoint Search API and is therefore bounded
/// by the search paging ceiling (see <see cref="MaxStartRow"/>); folder mode
/// walks the site's document libraries with paged CSOM CAML queries.
/// Keys matched by a single item only are discarded.
/// </summary>
/// <param name="ctx">Authenticated <see cref="ClientContext"/> for the target site.</param>
/// <param name="options">Scan mode, optional library filter, and match-key toggles.</param>
/// <param name="progress">Receives collection progress and the final grouping notice.</param>
/// <param name="ct">Cancellation token — checked on entry and between paged requests.</param>
/// <returns>
/// Duplicate groups ordered by descending number of items in the group,
/// then by group display name.
/// </returns>
public async Task<IReadOnlyList<DuplicateGroup>> ScanDuplicatesAsync(
    ClientContext ctx,
    DuplicateScanOptions options,
    IProgress<OperationProgress> progress,
    CancellationToken ct)
{
    ct.ThrowIfCancellationRequested();

    // "Folders" selects the CAML folder walk; any other mode value means a file search.
    List<DuplicateItem> collected = options.Mode == "Folders"
        ? await CollectFolderItemsAsync(ctx, options, progress, ct)
        : await CollectFileItemsAsync(ctx, options, progress, ct);

    progress.Report(OperationProgress.Indeterminate($"Grouping {collected.Count:N0} items by duplicate key\u2026"));

    var duplicates = new List<DuplicateGroup>();
    foreach (var bucket in collected.GroupBy(item => MakeKey(item, options)))
    {
        var members = bucket.ToList();
        if (members.Count < 2)
        {
            // A key matched by a single item is not a duplicate.
            continue;
        }

        // Distinct library names (case-insensitive), sorted, for the display label.
        var libraryNames = members
            .Select(member => member.Library)
            .Where(library => !string.IsNullOrEmpty(library))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(library => library, StringComparer.OrdinalIgnoreCase)
            .ToList();

        // The first member's name (original casing) represents the group.
        var baseName = members[0].Name;
        var displayName = libraryNames.Count > 0
            ? $"{baseName} ({string.Join(", ", libraryNames)})"
            : baseName;

        duplicates.Add(new DuplicateGroup
        {
            GroupKey = bucket.Key,
            Name = displayName,
            Items = members
        });
    }

    return duplicates
        .OrderByDescending(group => group.Items.Count)
        .ThenBy(group => group.Name)
        .ToList();
}
// ── File collection via Search API ────────────────────────────────────────
/// <summary>
/// Collects the documents returned by a SharePoint Search keyword query,
/// requesting <see cref="BatchSize"/> rows per page until an empty page comes
/// back or the next StartRow would pass <see cref="MaxStartRow"/>.
/// </summary>
/// <param name="ctx">Client context used for the search round trips.</param>
/// <param name="options">Only <c>Library</c> is read here: when set, the query is restricted to paths under it.</param>
/// <param name="progress">Receives the running item count after each page.</param>
/// <param name="ct">Checked before each page request.</param>
/// <returns>One <see cref="DuplicateItem"/> per result row, excluding <c>/_vti_history/</c> entries.</returns>
private static async Task<List<DuplicateItem>> CollectFileItemsAsync(
    ClientContext ctx,
    DuplicateScanOptions options,
    IProgress<OperationProgress> progress,
    CancellationToken ct)
{
    var (siteUrl, siteTitle) = await LoadSiteIdentityAsync(ctx, progress, ct);

    // KQL: all documents, optionally scoped to a library
    var kqlParts = new List<string> { "ContentType:Document" };
    if (!string.IsNullOrEmpty(options.Library))
        kqlParts.Add($"Path:\"{ctx.Url.TrimEnd('/')}/{options.Library.TrimStart('/')}*\"");
    string kql = string.Join(" AND ", kqlParts);

    // Loop-invariant: the managed properties requested on every page.
    string[] selectProperties =
    {
        "Title", "Path", "FileExtension", "Created",
        "LastModifiedTime", "Size", "ParentLink"
    };

    var allItems = new List<DuplicateItem>();
    int startRow = 0;
    bool reachedEnd = false;
    do
    {
        ct.ThrowIfCancellationRequested();

        var kq = new KeywordQuery(ctx)
        {
            QueryText = kql,
            StartRow = startRow,
            RowLimit = BatchSize,
            TrimDuplicates = false
        };
        foreach (var prop in selectProperties)
            kq.SelectProperties.Add(prop);

        var executor = new SearchExecutor(ctx);
        ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var table = clientResult.Value
            .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
        if (table == null || table.RowCount == 0)
        {
            reachedEnd = true;
            break;
        }

        foreach (var rawRow in table.ResultRows)
        {
            // CSOM has returned ResultRows as either Hashtable or
            // Dictionary<string,object> across versions — accept both.
            IDictionary<string, object> dict;
            if (rawRow is IDictionary<string, object> generic)
            {
                dict = generic;
            }
            else if (rawRow is System.Collections.IDictionary legacy)
            {
                dict = new Dictionary<string, object>();
                foreach (System.Collections.DictionaryEntry e in legacy)
                    dict[e.Key.ToString()!] = e.Value ?? string.Empty;
            }
            else
            {
                continue;
            }

            string path = GetStr(dict, "Path");
            // Skip version-history entries; they are not standalone documents.
            if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                continue;

            string name = System.IO.Path.GetFileName(path);
            if (string.IsNullOrEmpty(name))
                name = GetStr(dict, "Title");

            allItems.Add(new DuplicateItem
            {
                Name = name,
                Path = path,
                // Derive library from path segments
                Library = ExtractLibraryFromPath(path, ctx.Url),
                SizeBytes = ReadSize(dict, "Size"),
                Created = ReadDate(dict, "Created"),
                Modified = ReadDate(dict, "LastModifiedTime"),
                SiteUrl = siteUrl,
                SiteTitle = siteTitle
            });
        }

        progress.Report(new OperationProgress(allItems.Count, MaxStartRow,
            $"Collected {allItems.Count:N0} files\u2026"));
        startRow += BatchSize;
    }
    while (startRow <= MaxStartRow);

    if (!reachedEnd)
    {
        // The paging window ran out before an empty page was seen, so rows
        // beyond it were never requested. Leave a trace instead of truncating silently.
        Debug.WriteLine($"[DuplicatesService] CollectFileItemsAsync: stopped at the StartRow ceiling ({MaxStartRow:N0}) with {allItems.Count:N0} files collected; results may be incomplete.");
    }

    return allItems;

    // Reads a date property. A value that is already a DateTime is used directly,
    // avoiding a culture-sensitive ToString/TryParse round trip; anything else is
    // parsed from its string form as before.
    static DateTime? ReadDate(IDictionary<string, object> row, string key) =>
        row.TryGetValue(key, out var value) && value is DateTime typed
            ? typed
            : ParseDate(GetStr(row, key));

    // Reads the size as the ASCII digits of the value's string form (0 when none parse).
    static long ReadSize(IDictionary<string, object> row, string key)
    {
        string digits = string.Concat(GetStr(row, key).Where(c => c is >= '0' and <= '9'));
        return long.TryParse(digits, out var parsed) ? parsed : 0L;
    }
}
// ── Folder collection via CAML ────────────────────────────────────────────
/// <summary>
/// Collects every folder from the visible document libraries of the site,
/// optionally restricted to the library whose title equals
/// <c>options.Library</c> (case-insensitive).
/// </summary>
/// <param name="ctx">Client context for the target site.</param>
/// <param name="options">Only <c>Library</c> is read here.</param>
/// <param name="progress">Receives one indeterminate notice per library scanned.</param>
/// <param name="ct">Checked before each library and for each enumerated item.</param>
/// <returns>One <see cref="DuplicateItem"/> per folder, carrying subfolder and file counts.</returns>
private static async Task<List<DuplicateItem>> CollectFolderItemsAsync(
    ClientContext ctx,
    DuplicateScanOptions options,
    IProgress<OperationProgress> progress,
    CancellationToken ct)
{
    // Load all document libraries on the site
    ctx.Load(ctx.Web,
        w => w.Title,
        w => w.Lists.Include(
            l => l.Title, l => l.Hidden, l => l.BaseType));
    await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

    var siteUrl = ctx.Url;
    var siteTitle = string.IsNullOrWhiteSpace(ctx.Web.Title)
        ? ReportSplitHelper.DeriveSiteLabel(siteUrl)
        : ctx.Web.Title;

    var libs = ctx.Web.Lists
        .Where(l => !l.Hidden && l.BaseType == BaseType.DocumentLibrary)
        .ToList();

    // Filter to specific library if requested
    if (!string.IsNullOrEmpty(options.Library))
    {
        libs = libs
            .Where(l => l.Title.Equals(options.Library, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    // No WHERE clause — a WHERE on non-indexed fields (FSObjType) throws the
    // list-view threshold on libraries > 5,000 items even with pagination.
    // Filter for folders client-side via FileSystemObjectType below.
    var camlQuery = new CamlQuery
    {
        ViewXml = """
        <View Scope='RecursiveAll'>
        <Query></Query>
        <RowLimit Paged='TRUE'>5000</RowLimit>
        </View>
        """
    };

    var allItems = new List<DuplicateItem>();
    foreach (var lib in libs)
    {
        ct.ThrowIfCancellationRequested();
        progress.Report(OperationProgress.Indeterminate($"Scanning folders in {lib.Title}\u2026"));

        await foreach (var item in SharePointPaginationHelper.GetAllItemsAsync(ctx, lib, camlQuery, ct))
        {
            ct.ThrowIfCancellationRequested();
            if (item.FileSystemObjectType != FileSystemObjectType.Folder) continue;

            var fv = item.FieldValues;
            string name = Field(fv, "FileLeafRef")?.ToString() ?? string.Empty;
            string fileRef = Field(fv, "FileRef")?.ToString() ?? string.Empty;
            int subCount = Convert.ToInt32(Field(fv, "FolderChildCount") ?? 0);
            int childCount = Convert.ToInt32(Field(fv, "ItemChildCount") ?? 0);
            // File count is derived as the child count minus the subfolder count, floored at zero.
            int fileCount = Math.Max(0, childCount - subCount);
            DateTime? created = Field(fv, "Created") is DateTime cr ? cr : (DateTime?)null;
            DateTime? modified = Field(fv, "Modified") is DateTime md ? md : (DateTime?)null;

            allItems.Add(new DuplicateItem
            {
                Name = name,
                Path = fileRef,
                Library = lib.Title,
                FolderCount = subCount,
                FileCount = fileCount,
                Created = created,
                Modified = modified,
                SiteUrl = siteUrl,
                SiteTitle = siteTitle
            });
        }
    }
    return allItems;

    // A field missing from the returned values is treated like a null value, so it
    // takes the same defaults as above instead of throwing KeyNotFoundException
    // and aborting the whole scan.
    static object? Field(IDictionary<string, object> values, string internalName) =>
        values.TryGetValue(internalName, out var value) ? value : null;
}
// ── Composite key builder (matches test scaffold in DuplicatesServiceTests) ──
/// <summary>
/// Builds the grouping key for <paramref name="item"/>: the lower-cased name,
/// followed by one <c>|</c>-separated segment for each enabled match option
/// whose value is present on the item. Options whose value is missing
/// contribute no segment.
/// </summary>
/// <param name="item">Item to build the key for.</param>
/// <param name="opts">Match toggles selecting which attributes join the key.</param>
/// <returns>The composite key; just the lower-cased name when nothing else applies.</returns>
internal static string MakeKey(DuplicateItem item, DuplicateScanOptions opts)
{
    string key = item.Name.ToLowerInvariant();

    if (opts.MatchSize && item.SizeBytes.HasValue)
        key = string.Concat(key, "|", item.SizeBytes.Value.ToString());

    // Dates are compared at day granularity.
    if (opts.MatchCreated && item.Created.HasValue)
        key = string.Concat(key, "|", item.Created.Value.Date.ToString("yyyy-MM-dd"));

    if (opts.MatchModified && item.Modified.HasValue)
        key = string.Concat(key, "|", item.Modified.Value.Date.ToString("yyyy-MM-dd"));

    if (opts.MatchSubfolderCount && item.FolderCount.HasValue)
        key = string.Concat(key, "|", item.FolderCount.Value.ToString());

    if (opts.MatchFileCount && item.FileCount.HasValue)
        key = string.Concat(key, "|", item.FileCount.Value.ToString());

    return key;
}
// ── Private utilities ─────────────────────────────────────────────────────
/// <summary>
/// Returns the string form of the value stored under <paramref name="key"/>,
/// or an empty string when the key is absent, the value is null, or its
/// <c>ToString()</c> yields null.
/// </summary>
private static string GetStr(IDictionary<string, object> r, string key)
{
    if (!r.TryGetValue(key, out var value))
    {
        return string.Empty;
    }

    return value?.ToString() ?? string.Empty;
}
/// <summary>
/// Parses <paramref name="s"/> with <see cref="DateTime.TryParse(string, out DateTime)"/>
/// and returns null when the text is not recognised as a date.
/// </summary>
private static DateTime? ParseDate(string s)
{
    if (DateTime.TryParse(s, out var parsed))
    {
        return parsed;
    }

    return null;
}
/// <summary>
/// Resolves the URL and display title of the site behind <paramref name="ctx"/>.
/// Loading the web title is best-effort: any failure other than cancellation
/// is written to the debug output and the title falls back to a label derived
/// from the URL.
/// </summary>
/// <param name="ctx">Client context whose <c>Url</c> and <c>Web.Title</c> are read.</param>
/// <param name="progress">Forwarded to the retrying query helper.</param>
/// <param name="ct">Forwarded to the retrying query helper; cancellation propagates to the caller.</param>
/// <returns>The context URL (empty when null) and a non-blank title.</returns>
private static async Task<(string Url, string Title)> LoadSiteIdentityAsync(
    ClientContext ctx, IProgress<OperationProgress> progress, CancellationToken ct)
{
    try
    {
        ctx.Load(ctx.Web, w => w.Title);
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Best-effort — fall back to URL-derived label
        Debug.WriteLine($"[DuplicatesService] LoadSiteIdentityAsync: failed to load Web.Title: {ex.GetType().Name}: {ex.Message}");
    }

    string siteUrl = ctx.Url ?? string.Empty;

    // Stays empty when the property getter throws (e.g. the load above failed).
    string siteTitle = string.Empty;
    try
    {
        siteTitle = ctx.Web.Title;
    }
    catch (Exception ex)
    {
        Debug.WriteLine($"[DuplicatesService] LoadSiteIdentityAsync: Web.Title getter threw: {ex.GetType().Name}: {ex.Message}");
    }

    return string.IsNullOrWhiteSpace(siteTitle)
        ? (siteUrl, ReportSplitHelper.DeriveSiteLabel(siteUrl))
        : (siteUrl, siteTitle);
}
/// <summary>
/// Returns the first path segment after the site URL, which is used as the
/// library name, e.g.
/// <c>https://tenant.sharepoint.com/sites/MySite/Shared Documents/file.docx</c>
/// with site <c>https://tenant.sharepoint.com/sites/MySite</c> gives
/// <c>Shared Documents</c>.
/// </summary>
/// <param name="path">Absolute item URL, or a path already relative to the site.</param>
/// <param name="siteUrl">Site URL; a trailing slash is ignored.</param>
/// <returns>
/// The library segment; an empty string when either argument is null or empty,
/// or when <paramref name="path"/> is an absolute URL outside
/// <paramref name="siteUrl"/>.
/// </returns>
private static string ExtractLibraryFromPath(string path, string siteUrl)
{
    if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(siteUrl))
        return string.Empty;

    string siteRoot = siteUrl.TrimEnd('/');
    string relative;

    // The prefix must end on a segment boundary: ".../sites/MySite2/..." is not
    // inside ".../sites/MySite" even though the strings share a prefix.
    bool underSite = path.StartsWith(siteRoot, StringComparison.OrdinalIgnoreCase)
        && (path.Length == siteRoot.Length || path[siteRoot.Length] == '/');

    if (underSite)
    {
        relative = path.Substring(siteRoot.Length).TrimStart('/');
    }
    else if (path.Contains("://", StringComparison.Ordinal))
    {
        // Absolute URL outside this site: there is no library to name, and taking
        // its first segment would yield the scheme ("https:").
        return string.Empty;
    }
    else
    {
        // Not an absolute URL — treat it as already site-relative.
        relative = path;
    }

    int slash = relative.IndexOf('/');
    return slash > 0 ? relative.Substring(0, slash) : relative;
}
}