diff --git a/docs/docs/pdf.md b/docs/docs/pdf.md index 52934054753..bf9c296045c 100644 --- a/docs/docs/pdf.md +++ b/docs/docs/pdf.md @@ -75,6 +75,33 @@ Sets the PDF output file name. The default value is `toc.pdf`. Indicates whether to include a "Table of Contents" pages at the beginning. +### `pdfTocSource` + +Controls the source for the PDF Table of Contents. Possible values: + +- `toc` (default): Generates TOC from the `toc.yml` structure. +- `headings`: Generates TOC from headings (h1, h2, h3, etc.) extracted from document content. + +When set to `headings`, the TOC reflects the actual heading structure within each document rather than the navigation defined in `toc.yml`. This is useful for single-document or small documentation sets where you want the PDF TOC to show the internal sections of each document. + +```yaml +pdf: true +pdfTocPage: true +pdfTocSource: headings +items: +- name: My Document + href: my-document.md +``` + +### `pdfTocHeadingDepth` + +Maximum heading level to include in the PDF TOC when `pdfTocSource` is `headings`. Default is `3`, which includes h1, h2, and h3 headings. Set to a higher value (up to 6) to include deeper heading levels. + +```yaml +pdfTocSource: headings +pdfTocHeadingDepth: 4 # Include h1-h4 headings +``` + ### `pdfCoverPage` A path to an HTML page relative to the root of the output directory. The HTML page will be inserted at the beginning of the PDF file as cover page. diff --git a/schemas/toc.schema.json b/schemas/toc.schema.json index 3fdce6ec949..f1ad45e1efd 100644 --- a/schemas/toc.schema.json +++ b/schemas/toc.schema.json @@ -106,6 +106,19 @@ "type": "boolean", "default": false, "description": "If set to true, Child items are displayed as dropdown on top navigation bar." + }, + "pdfTocSource": { + "type": "string", + "enum": ["toc", "headings"], + "default": "toc", + "description": "Source for PDF Table of Contents. 'toc' uses toc.yml structure (default), 'headings' extracts headings from document content." 
+ }, + "pdfTocHeadingDepth": { + "type": "integer", + "minimum": 1, + "maximum": 6, + "default": 3, + "description": "Maximum heading level to include in PDF TOC when pdfTocSource is 'headings'. For example, 3 includes h1, h2, and h3." } } }, diff --git a/src/Docfx.App/PdfBuilder.cs b/src/Docfx.App/PdfBuilder.cs index 1357cd8236c..29077ea85b4 100644 --- a/src/Docfx.App/PdfBuilder.cs +++ b/src/Docfx.App/PdfBuilder.cs @@ -37,6 +37,15 @@ static class PdfBuilder { private static readonly SearchValues InvalidPathChars = SearchValues.Create(Path.GetInvalidPathChars()); + class HeadingInfo + { + public string Text { get; init; } = ""; + public string Id { get; init; } = ""; + public int Level { get; init; } + public Uri PageUrl { get; init; } = null!; + public int PageNumber { get; set; } + } + class Outline { public string name { get; init; } = ""; @@ -51,6 +60,9 @@ class Outline public string? pdfHeaderTemplate { get; init; } public string? pdfFooterTemplate { get; init; } + + public string? pdfTocSource { get; init; } + public int pdfTocHeadingDepth { get; init; } = 3; } public static Task Run(BuildJsonConfig config, string configDirectory, string? outputDirectory = null, CancellationToken cancellationToken = default) @@ -93,6 +105,7 @@ void onSignal(PosixSignalContext context) Uri? 
baseUrl = null; var pdfPageNumbers = new ConcurrentDictionary>(); + var pdfHeadings = new ConcurrentDictionary>(); using var app = builder.Build(); app.UseServe(outputFolder); @@ -127,6 +140,7 @@ void onSignal(PosixSignalContext context) await CreatePdf( PrintPdf, PrintHeaderFooter, task, new(baseUrl, url), toc, outputFolder, pdfOutputPath, pageNumbers => pdfPageNumbers[url] = pageNumbers, + headings => pdfHeadings[url] = headings, cancellationToken); task.Value = task.MaxValue; @@ -186,20 +200,22 @@ await CreatePdf( IResult TocPage(string url) { var pageNumbers = pdfPageNumbers.GetValueOrDefault(url); - return Results.Content(TocHtmlTemplate(new Uri(baseUrl!, url), pdfTocs[url], pageNumbers).ToString(), "text/html", Encoding.UTF8); + var headings = pdfHeadings.GetValueOrDefault(url); + return Results.Content(TocHtmlTemplate(new Uri(baseUrl!, url), pdfTocs[url], pageNumbers, headings).ToString(), "text/html", Encoding.UTF8); } - async Task PrintPdf(Outline outline, Uri url) + async Task<(byte[]? bytes, List headings)> PrintPdf(Outline outline, Uri url, int headingDepth) { await pageLimiter.WaitAsync(cancellationToken); var page = pagePool.TryTake(out var pooled) ? 
pooled : await context.NewPageAsync(); + var headings = new List(); try { Uri beforeUri = new(page.Url); var response = await page.GotoAsync(url.ToString(), new() { WaitUntil = WaitUntilState.DOMContentLoaded }); if (response?.Status is 404) - return null; + return (null, headings); bool isSameUrlNavigation = response == null && beforeUri == url; bool isHashFragmentNavigation = response == null @@ -234,11 +250,19 @@ IResult TocPage(string url) } } - return await page.PdfAsync(new PagePdfOptions + // Extract headings from the page if needed + if (outline.pdfTocSource == "headings" && headingDepth > 0 && !IsTocPage(url) && !IsCoverPage(url, outputFolder, outline.pdfCoverPage)) + { + headings = await ExtractHeadingsFromPage(page, url, headingDepth); + } + + var bytes = await page.PdfAsync(new PagePdfOptions { PreferCSSPageSize = true, PrintBackground = outline.pdfPrintBackground, }); + + return (bytes, headings); } finally { @@ -247,6 +271,45 @@ IResult TocPage(string url) } } + async Task> ExtractHeadingsFromPage(IPage page, Uri pageUrl, int maxDepth) + { + var headings = new List(); + var selector = string.Join(",", Enumerable.Range(1, maxDepth).Select(i => $"article h{i}, .content h{i}")); + + try + { + var elements = await page.QuerySelectorAllAsync(selector); + foreach (var element in elements) + { + var tagName = await element.EvaluateAsync("e => e.tagName"); + var level = int.Parse(tagName[1].ToString()); + var id = await element.GetAttributeAsync("id") ?? ""; + var text = (await element.InnerTextAsync()).Trim(); + + // Skip headings without id or text + if (string.IsNullOrEmpty(id) || string.IsNullOrEmpty(text)) + continue; + + // Clean up text (remove source link icons, etc.) 
+ var cleanText = text.Split('\n')[0].Trim(); + + headings.Add(new HeadingInfo + { + Text = cleanText, + Id = id, + Level = level, + PageUrl = pageUrl + }); + } + } + catch (Exception ex) + { + Logger.LogWarning($"Failed to extract headings from {pageUrl}: {ex.Message}"); + } + + return headings; + } + Task PrintHeaderFooter(Outline toc, int pageNumber, int totalPages, Page contentPage) { var headerTemplate = ExpandTemplate(GetHeaderFooter(toc.pdfHeaderTemplate), pageNumber, totalPages); @@ -333,14 +396,15 @@ static string ExpandTemplate(string? pdfTemplate, int pageNumber, int totalPages } static async Task CreatePdf( - Func> printPdf, Func> printHeaderFooter, ProgressTask task, - Uri outlineUrl, Outline outline, string outputFolder, string pdfOutputPath, Action> updatePageNumbers, CancellationToken cancellationToken) + Func headings)>> printPdf, Func> printHeaderFooter, ProgressTask task, + Uri outlineUrl, Outline outline, string outputFolder, string pdfOutputPath, Action> updatePageNumbers, Action> updateHeadings, CancellationToken cancellationToken) { var pages = GetPages(outline).ToArray(); if (pages.Length == 0) return; var pageBytes = new Dictionary(); + var pageHeadings = new Dictionary>(); // Make progress at 99% before merge PDF task.MaxValue = pages.Length + (pages.Length / 99.0); @@ -348,17 +412,58 @@ static async Task CreatePdf( await Parallel.ForEachAsync(pages, new ParallelOptions { CancellationToken = cancellationToken }, async (item, _) => { var (url, node) = item; - if (await printPdf(outline, url) is { } bytes) + + // Skip TOC pages - they depend on data from content pages (headings, page numbers) + // and will be rendered later once that data is available. 
+ if (IsTocPage(url)) + { + task.Increment(1); + return; + } + + var result = await printPdf(outline, url, outline.pdfTocHeadingDepth); + if (result.bytes is { } bytes) { lock (pageBytes) pageBytes[node] = bytes; } + if (result.headings.Count > 0) + { + lock (pageHeadings) + pageHeadings[node] = result.headings; + } task.Increment(1); }); + // Collect headings in document order: + // - Page order: preserved by iterating `pages` array (parallel processing loses this) + // - Within-page order: preserved by DOM order from QuerySelectorAllAsync + var allHeadings = pages + .Where(p => pageHeadings.ContainsKey(p.node)) + .SelectMany(p => pageHeadings[p.node]) + .ToList(); + + // Update headings before page numbers are calculated + updateHeadings(allHeadings); + + // Render the TOC page now that headings are available. + // This is deferred from the parallel render because the TOC content depends on + // data extracted from content pages (headings for heading-based TOC). + // Rendering it here ensures the correct page count before calculating page numbers. + foreach (var (tocUrl, tocNode) in pages) + { + if (!IsTocPage(tocUrl)) + continue; + + var result = await printPdf(outline, tocUrl, 0); + if (result.bytes != null) + pageBytes[tocNode] = result.bytes; + } + var pagesByNode = pages.ToDictionary(p => p.node); var pagesByUrl = new Dictionary>(); var pageNumbers = new Dictionary(); + var urlPageNumbers = new Dictionary(); var numberOfPages = 0; foreach (var (url, node) in pages) @@ -379,12 +484,39 @@ static async Task CreatePdf( pageBytes[node] = bytes; pageNumbers[node] = numberOfPages + 1; + urlPageNumbers[CleanUrl(url)] = numberOfPages + 1; numberOfPages += document.NumberOfPages; } if (numberOfPages is 0) return; + // Resolve actual page numbers for headings using named destinations. + // Each heading ID corresponds to a named destination in the document's PDF, + // so we can determine the exact page within the merged PDF. 
+ foreach (var heading in allHeadings) + { + var cleanUrl = CleanUrl(heading.PageUrl); + if (pagesByUrl.TryGetValue(cleanUrl, out var dests)) + { + var resolved = false; + foreach (var (node, namedDests) in dests) + { + if (namedDests.TryGet(heading.Id, out var dest) && dest is not null) + { + heading.PageNumber = pageNumbers[node] - 1 + dest.PageNumber; + resolved = true; + break; + } + } + // Fall back to document start page if heading ID wasn't found in named destinations + if (!resolved && urlPageNumbers.TryGetValue(cleanUrl, out var startPage)) + { + heading.PageNumber = startPage; + } + } + } + var producer = $"docfx ({typeof(PdfBuilder).Assembly.GetCustomAttribute()?.Version})"; using var output = File.Create(pdfOutputPath); @@ -444,7 +576,9 @@ async Task MergePdf() { // Refresh TOC page numbers updatePageNumbers(pageNumbers); - bytes = await printPdf(outline, url); + updateHeadings(allHeadings); + var result = await printPdf(outline, url, 0); // 0 = don't extract headings from TOC page + bytes = result.bytes; if (bytes == null) continue; @@ -525,21 +659,6 @@ PdfAction HandleUriAction(UriAction url) static Uri CleanUrl(Uri url) => new UriBuilder(url) { Query = null, Fragment = null }.Uri; - static bool IsCoverPage(Uri pageUri, string baseFolder, string? pdfCoverPage) - { - Debug.Assert(Path.IsPathFullyQualified(baseFolder)); - - if (string.IsNullOrEmpty(pdfCoverPage)) - return false; - - string pagePath = pageUri.AbsolutePath.TrimStart('/'); - string covePagePath = PathUtility.MakeRelativePath(baseFolder, Path.GetFullPath(Path.Combine(baseFolder, pdfCoverPage))); - - return pagePath.Equals(covePagePath, GetStringComparison()); - } - - static bool IsTocPage(Uri url) => url.AbsolutePath.StartsWith("/_pdftoc/"); - Bookmarks CreateBookmarks(Outline[]? items) { var nextPageNumber = 1; @@ -607,8 +726,40 @@ IEnumerable CreateBookmarksCore(Outline[]? items, int level) } } - static HtmlTemplate TocHtmlTemplate(Uri baseUrl, Outline node, Dictionary? 
pageNumbers) + static HtmlTemplate TocHtmlTemplate(Uri baseUrl, Outline node, Dictionary? pageNumbers, List? headings) { + // If pdfTocSource is "headings" and we have headings, generate TOC from headings + if (node.pdfTocSource == "headings" && headings is { Count: > 0 }) + { + var headingTocContent = BuildHeadingToc(baseUrl, headings); + var cssStyles = Html($""" + + """); + return Html($""" + + + + + + {cssStyles} + + +

Table of Contents

+
    {headingTocContent}
+ + + """); + } + + // Default: generate TOC from toc.yml structure return Html($""" @@ -637,6 +788,34 @@ static HtmlTemplate TocHtmlTemplate(Uri baseUrl, Outline node, Dictionary headings) + { + // Build flat list of all headings with CSS-based indentation for hierarchy + var result = new List(); + + foreach (var heading in headings) + { + var href = new UriBuilder(heading.PageUrl) { Fragment = heading.Id }.Uri; + + var pageNumberHtml = heading.PageNumber > 0 + ? Html($" {heading.PageNumber}") + : default; + + // Use data-level attribute for CSS styling of indentation + var item = Html($""" +
  • + {System.Web.HttpUtility.HtmlEncode(heading.Text)} + {pageNumberHtml} + +
  • + """); + + result.Add(item); + } + + return Html($"{result}"); + } + /// /// Adds hidden links to headings to ensure Chromium saves heading anchors to named dests /// for cross page bookmark reference. @@ -726,4 +905,21 @@ private static StringComparison GetStringComparison() ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal; } + + /// <summary>Returns true when the URL's absolute path starts with "/_pdftoc/" (ordinal, culture-independent comparison).</summary> + private static bool IsTocPage(Uri url) => url.AbsolutePath.StartsWith("/_pdftoc/", StringComparison.Ordinal); + + /// <summary>Returns true when the page URI's path (leading '/' trimmed) equals the cover page path made relative to <paramref name="baseFolder"/>; always false when <paramref name="pdfCoverPage"/> is null or empty.</summary> + private static bool IsCoverPage(Uri pageUri, string baseFolder, string? pdfCoverPage) + { + Debug.Assert(Path.IsPathFullyQualified(baseFolder)); + + if (string.IsNullOrEmpty(pdfCoverPage)) + return false; + + string pagePath = pageUri.AbsolutePath.TrimStart('/'); + string coverPagePath = PathUtility.MakeRelativePath(baseFolder, Path.GetFullPath(Path.Combine(baseFolder, pdfCoverPage))); + + return pagePath.Equals(coverPagePath, GetStringComparison()); + } }