- File mode: Search API KQL pagination matching SearchService pattern - Folder mode: CAML FSObjType=1 via SharePointPaginationHelper.GetAllItemsAsync - MakeKey composite key (name+size+dates+counts) matches DuplicatesServiceTests scaffold - Groups only items with count >= 2, ordered by group size then name - ExtractLibraryFromPath derives library name from path relative to site URL - SelectProperties added per-item (StringCollection has no AddRange)
251 lines
10 KiB
C#
251 lines
10 KiB
C#
using Microsoft.SharePoint.Client;
|
|
using Microsoft.SharePoint.Client.Search.Query;
|
|
using SharepointToolbox.Core.Helpers;
|
|
using SharepointToolbox.Core.Models;
|
|
|
|
namespace SharepointToolbox.Services;
|
|
|
|
/// <summary>
/// Duplicate file and folder detection.
/// Files: Search API (same KQL engine as SearchService) + client-side composite key grouping.
/// Folders: CSOM CAML FSObjType=1 via SharePointPaginationHelper + composite key grouping.
/// Port of PS Find-DuplicateFiles / Find-DuplicateFolders (PS lines 4942-5036).
/// </summary>
public class DuplicatesService : IDuplicatesService
{
    /// <summary>Rows requested per search page.</summary>
    private const int BatchSize = 500;

    /// <summary>Highest <c>StartRow</c> that will be requested; paging stops once it is exceeded.</summary>
    private const int MaxStartRow = 50_000;

    /// <summary>Managed properties requested for every search row.</summary>
    private static readonly string[] FileSelectProperties =
    {
        "Title", "Path", "FileExtension", "Created", "LastModifiedTime", "Size", "ParentLink"
    };

    /// <summary>Matches every character that is not an ASCII digit (used to clean the raw Size value).</summary>
    private static readonly System.Text.RegularExpressions.Regex NonDigits =
        new("[^0-9]", System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>CAML view selecting every folder (FSObjType = 1) at any depth of a list.</summary>
    private const string FolderViewXml = """
        <View Scope='RecursiveAll'>
          <Query>
            <Where>
              <Eq>
                <FieldRef Name='FSObjType' />
                <Value Type='Integer'>1</Value>
              </Eq>
            </Where>
          </Query>
          <RowLimit>2000</RowLimit>
        </View>
        """;

    /// <summary>
    /// Collects files (default) or folders (when <c>options.Mode</c> is exactly "Folders"),
    /// groups them by the composite key built by <see cref="MakeKey"/>, and returns only the
    /// groups holding two or more items, largest group first, then by name.
    /// </summary>
    /// <param name="ctx">Client context of the site to scan.</param>
    /// <param name="options">Scan mode, optional library filter and key-matching switches.</param>
    /// <param name="progress">Receives progress messages during collection and grouping.</param>
    /// <param name="ct">Cancellation token, checked before and during collection.</param>
    /// <returns>Duplicate groups; empty when no key occurs at least twice.</returns>
    /// <exception cref="ArgumentNullException">Any reference argument is null.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    public async Task<IReadOnlyList<DuplicateGroup>> ScanDuplicatesAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // Fail fast instead of surfacing a NullReferenceException after a long scan.
        ArgumentNullException.ThrowIfNull(ctx);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(progress);
        ct.ThrowIfCancellationRequested();

        List<DuplicateItem> allItems = options.Mode == "Folders"
            ? await CollectFolderItemsAsync(ctx, options, progress, ct)
            : await CollectFileItemsAsync(ctx, options, progress, ct);

        progress.Report(OperationProgress.Indeterminate($"Grouping {allItems.Count:N0} items by duplicate key\u2026"));

        return allItems
            .GroupBy(item => MakeKey(item, options))
            // Materialize each grouping once so it is not enumerated for both the count and the copy.
            .Select(g => (g.Key, Members: g.ToList()))
            .Where(g => g.Members.Count >= 2)
            .Select(g => new DuplicateGroup
            {
                GroupKey = g.Key,
                Name = g.Members[0].Name,
                Items = g.Members
            })
            .OrderByDescending(g => g.Items.Count)
            .ThenBy(g => g.Name)
            .ToList();
    }

    // ── File collection via Search API ────────────────────────────────────────

    /// <summary>
    /// Pages through the Search API (<see cref="BatchSize"/> rows per request, up to
    /// <see cref="MaxStartRow"/>) and converts every result row into a <see cref="DuplicateItem"/>.
    /// Rows whose path contains <c>/_vti_history/</c> are skipped.
    /// </summary>
    private static async Task<List<DuplicateItem>> CollectFileItemsAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // KQL: all documents, optionally scoped to a library
        var kqlParts = new List<string> { "ContentType:Document" };
        if (!string.IsNullOrEmpty(options.Library))
        {
            kqlParts.Add($"Path:\"{ctx.Url.TrimEnd('/')}/{options.Library.TrimStart('/')}*\"");
        }
        string kql = string.Join(" AND ", kqlParts);

        var allItems = new List<DuplicateItem>();
        int startRow = 0;

        do
        {
            ct.ThrowIfCancellationRequested();

            var kq = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false
            };
            // Properties are added one at a time, as in the original code.
            foreach (var prop in FileSelectProperties)
            {
                kq.SelectProperties.Add(prop);
            }

            var executor = new SearchExecutor(ctx);
            ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            var table = clientResult.Value
                .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
            if (table == null || table.RowCount == 0)
            {
                break;
            }

            // Rows are enumerated as object and normalized, rather than hard-cast to Hashtable:
            // CSOM documents ResultRows as IDictionary<string, object> rows, which are not Hashtables.
            foreach (object rawRow in table.ResultRows)
            {
                IDictionary<string, object> row = ToRowDictionary(rawRow);

                string path = GetStr(row, "Path");
                if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string name = System.IO.Path.GetFileName(path);
                if (string.IsNullOrEmpty(name))
                {
                    name = GetStr(row, "Title");
                }

                // Size may arrive with non-digit decoration; keep only the digits before parsing.
                string digits = NonDigits.Replace(GetStr(row, "Size"), "");
                long size = long.TryParse(
                    digits,
                    System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out var parsedSize) ? parsedSize : 0L;

                allItems.Add(new DuplicateItem
                {
                    Name = name,
                    Path = path,
                    // Derive library from path segments
                    Library = ExtractLibraryFromPath(path, ctx.Url),
                    SizeBytes = size,
                    Created = GetDate(row, "Created"),
                    Modified = GetDate(row, "LastModifiedTime")
                });
            }

            progress.Report(new OperationProgress(allItems.Count, MaxStartRow,
                $"Collected {allItems.Count:N0} files\u2026"));

            startRow += BatchSize;
        }
        while (startRow <= MaxStartRow);

        return allItems;
    }

    // ── Folder collection via CAML ────────────────────────────────────────────

    /// <summary>
    /// Enumerates every folder of each visible document library (or only the library whose title
    /// equals <c>options.Library</c>, case-insensitively) and converts it into a
    /// <see cref="DuplicateItem"/> carrying sub-folder and file counts.
    /// </summary>
    private static async Task<List<DuplicateItem>> CollectFolderItemsAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // Load all document libraries on the site
        ctx.Load(ctx.Web,
            w => w.Lists.Include(
                l => l.Title, l => l.Hidden, l => l.BaseType));
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var libs = ctx.Web.Lists
            .Where(l => !l.Hidden && l.BaseType == BaseType.DocumentLibrary)
            .ToList();

        // Filter to specific library if requested
        if (!string.IsNullOrEmpty(options.Library))
        {
            libs = libs
                .Where(l => l.Title.Equals(options.Library, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        var allItems = new List<DuplicateItem>();

        foreach (var lib in libs)
        {
            ct.ThrowIfCancellationRequested();
            progress.Report(OperationProgress.Indeterminate($"Scanning folders in {lib.Title}\u2026"));

            // A fresh query per library: the pagination helper is not visible here, and if it
            // records paging state on the query object that state must not leak into the next library.
            var camlQuery = new CamlQuery { ViewXml = FolderViewXml };

            await foreach (var item in SharePointPaginationHelper.GetAllItemsAsync(ctx, lib, camlQuery, ct))
            {
                ct.ThrowIfCancellationRequested();

                IDictionary<string, object> fv = item.FieldValues;

                // Missing or malformed fields degrade to empty/zero instead of aborting the whole scan.
                int subCount = GetInt(fv, "FolderChildCount");
                int childCount = GetInt(fv, "ItemChildCount");

                allItems.Add(new DuplicateItem
                {
                    Name = GetStr(fv, "FileLeafRef"),
                    Path = GetStr(fv, "FileRef"),
                    Library = lib.Title,
                    FolderCount = subCount,
                    // File count is derived as total children minus sub-folders, clamped at zero.
                    FileCount = Math.Max(0, childCount - subCount),
                    Created = fv.TryGetValue("Created", out var c) && c is DateTime createdAt ? createdAt : null,
                    Modified = fv.TryGetValue("Modified", out var m) && m is DateTime modifiedAt ? modifiedAt : null
                });
            }
        }

        return allItems;
    }

    // ── Composite key builder (matches test scaffold in DuplicatesServiceTests) ──

    /// <summary>
    /// Builds the grouping key: the lower-cased name followed by each enabled and available
    /// criterion (size, created date, modified date, sub-folder count, file count), joined with '|'.
    /// Values are formatted with the invariant culture so the key text does not depend on the
    /// machine's calendar or number settings.
    /// </summary>
    /// <remarks>
    /// NOTE(review): a criterion whose value is missing is skipped rather than written as an empty
    /// slot, so keys of items with different missing fields can coincide. The layout is kept
    /// unchanged because the test scaffold is stated to depend on it.
    /// </remarks>
    internal static string MakeKey(DuplicateItem item, DuplicateScanOptions opts)
    {
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        var parts = new List<string> { item.Name.ToLowerInvariant() };

        if (opts.MatchSize && item.SizeBytes.HasValue)
        {
            parts.Add(item.SizeBytes.Value.ToString(invariant));
        }
        if (opts.MatchCreated && item.Created.HasValue)
        {
            parts.Add(item.Created.Value.Date.ToString("yyyy-MM-dd", invariant));
        }
        if (opts.MatchModified && item.Modified.HasValue)
        {
            parts.Add(item.Modified.Value.Date.ToString("yyyy-MM-dd", invariant));
        }
        if (opts.MatchSubfolderCount && item.FolderCount.HasValue)
        {
            parts.Add(item.FolderCount.Value.ToString(invariant));
        }
        if (opts.MatchFileCount && item.FileCount.HasValue)
        {
            parts.Add(item.FileCount.Value.ToString(invariant));
        }

        return string.Join("|", parts);
    }

    // ── Private utilities ─────────────────────────────────────────────────────

    /// <summary>
    /// Normalizes one search result row to a string-keyed dictionary. Generic dictionaries are
    /// used as-is; non-generic ones (e.g. Hashtable) are copied, with null values replaced by an
    /// empty string; anything else yields an empty dictionary.
    /// </summary>
    private static IDictionary<string, object> ToRowDictionary(object? row)
    {
        if (row is IDictionary<string, object> typed)
        {
            return typed;
        }

        var copy = new Dictionary<string, object>();
        if (row is System.Collections.IDictionary legacy)
        {
            foreach (System.Collections.DictionaryEntry entry in legacy)
            {
                copy[entry.Key.ToString() ?? string.Empty] = entry.Value ?? string.Empty;
            }
        }
        return copy;
    }

    /// <summary>Returns the value stored under <paramref name="key"/> as text, or an empty string when absent or null.</summary>
    private static string GetStr(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;

    /// <summary>
    /// Reads a date: a value that is already a <see cref="DateTime"/> is used directly (no
    /// culture-dependent text round-trip); otherwise its text form is parsed.
    /// </summary>
    private static DateTime? GetDate(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) && v is DateTime typed ? typed : ParseDate(GetStr(r, key));

    /// <summary>Reads an integer field; a missing, null or unconvertible value yields 0.</summary>
    private static int GetInt(IDictionary<string, object> r, string key)
    {
        if (!r.TryGetValue(key, out var value) || value is null)
        {
            return 0;
        }

        try
        {
            return Convert.ToInt32(value, System.Globalization.CultureInfo.InvariantCulture);
        }
        catch (Exception ex) when (ex is FormatException or InvalidCastException or OverflowException)
        {
            return 0;
        }
    }

    /// <summary>Parses <paramref name="s"/> with the current culture; returns null when it is not a date.</summary>
    private static DateTime? ParseDate(string s) =>
        DateTime.TryParse(s, out var dt) ? dt : (DateTime?)null;

    /// <summary>
    /// Returns the first path segment after the site URL as the library name,
    /// e.g. https://tenant.sharepoint.com/sites/MySite/Shared Documents/file.docx -> "Shared Documents".
    /// The site URL only counts as a prefix when it ends on a segment boundary, so site
    /// ".../sites/My" does not match a path under ".../sites/MySite". A path that is not under the
    /// site is processed as given (its own first segment, or the whole path when it has no
    /// interior '/').
    /// </summary>
    private static string ExtractLibraryFromPath(string path, string siteUrl)
    {
        if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(siteUrl))
        {
            return string.Empty;
        }

        string root = siteUrl.TrimEnd('/');
        bool underSite = path.StartsWith(root, StringComparison.OrdinalIgnoreCase)
            && (path.Length == root.Length || path[root.Length] == '/');

        string relative = underSite
            ? path.Substring(root.Length).TrimStart('/')
            : path;

        int slash = relative.IndexOf('/');
        return slash > 0 ? relative.Substring(0, slash) : relative;
    }
}
|