Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions scripts/notion-fetch/contentSanitizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -226,4 +226,37 @@ echo "# Not a heading"
});
});
});

// Unit tests for injectExplicitHeadingIds (heading-anchor injection).
describe("injectExplicitHeadingIds", () => {
it("should normalize accented headings and append stable duplicate suffixes", () => {
// Two identical titles plus an accented/punctuated one.
const input = [
"# Título Único",
"## Título Único",
"### Niño & Acción",
].join("\n");

const result = scriptModule.injectExplicitHeadingIds(input);

// First occurrence gets the bare slug; the duplicate gets a -1 suffix.
expect(result).toContain("# Título Único {#titulo-unico}");
expect(result).toContain("## Título Único {#titulo-unico-1}");
expect(result).toContain("### Niño & Acción {#nino-accion}");
});

it("should preserve existing explicit heading ids and code fences", () => {
const input = [
"# Encabezado {#custom-id}",
"```md",
"## Código Único",
"```",
"## Otro Título",
].join("\n");

const result = scriptModule.injectExplicitHeadingIds(input);

// An existing {#id} must not be replaced, and heading-like lines inside
// fenced code blocks must not receive IDs.
expect(result).toContain("# Encabezado {#custom-id}");
expect(result).toContain("```md\n## Código Único\n```");
expect(result).toContain("## Otro Título {#otro-titulo}");
expect(result).not.toContain("## Código Único {#codigo-unico}");
});
});
});
78 changes: 77 additions & 1 deletion scripts/notion-fetch/contentSanitizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* that cause MDX compilation errors in Docusaurus.
*/

import { createSafeSlug } from "./slugUtils";

const EMOJI_STYLE_MARKERS = ["display:", "height:", "margin:"];

const isEmojiStyleObject = (snippet: string): boolean =>
Expand Down Expand Up @@ -68,6 +70,80 @@ function fixHeadingHierarchy(
return fixedLines.join("\n");
}

function maskCodeFences(content: string): {
content: string;
codeBlocks: string[];
codeBlockPlaceholders: string[];
} {
const codeBlocks: string[] = [];
const codeBlockPlaceholders: string[] = [];

const maskedContent = content.replace(
/^```[^\n]*\n[\s\S]*?^```/gm,
(match) => {
Comment thread
luandro marked this conversation as resolved.
Outdated
codeBlocks.push(match);
const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`;
codeBlockPlaceholders.push(placeholder);
return placeholder;
}
);

return {
content: maskedContent,
codeBlocks,
codeBlockPlaceholders,
};
}

/**
 * Inverse of `maskCodeFences`: expands each `__CODEBLOCK_N__` placeholder
 * back into the fenced block stored at index N of `codeBlocks`.
 */
function restoreCodeFences(content: string, codeBlocks: string[]): string {
  const placeholderPattern = /__CODEBLOCK_(\d+)__/g;
  return content.replace(placeholderPattern, (_placeholder, rawIndex) => {
    const blockIndex = Number(rawIndex);
    return codeBlocks[blockIndex];
  });
}

export function injectExplicitHeadingIds(content: string): string {
if (!content) {
return content;
}

const {
content: maskedContent,
codeBlocks,
codeBlockPlaceholders,
} = maskCodeFences(content);
const headingCounts = new Map<string, number>();

const lines = maskedContent.split("\n");
const updatedLines = lines.map((line) => {
if (
codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) ||
/\s\{#[^}]+\}\s*$/.test(line)
) {
return line;
Comment thread
luandro marked this conversation as resolved.
Outdated
}

const headingMatch = line.match(/^(\s{0,3})(#{1,6})\s+(.+?)\s*$/);
if (!headingMatch) {
return line;
}

const [, leadingWhitespace, hashes, headingText] = headingMatch;
const baseId = createSafeSlug(headingText);
if (!baseId) {
return line;
}

const currentCount = headingCounts.get(baseId) ?? 0;
headingCounts.set(baseId, currentCount + 1);
const headingId = currentCount === 0 ? baseId : `${baseId}-${currentCount}`;

return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`;
});

return restoreCodeFences(updatedLines.join("\n"), codeBlocks);
}

/**
* Sanitizes markdown content to fix malformed HTML/JSX tags that cause MDX compilation errors
* @param content - The markdown content string
Expand All @@ -81,7 +157,7 @@ export function sanitizeMarkdownContent(content: string): string {
const codeSpans: string[] = [];
const codeBlockPlaceholders: string[] = [];

content = content.replace(/```[\s\S]*?```/g, (m) => {
content = content.replace(/^```[^\n]*\n[\s\S]*?^```/gm, (m) => {
codeBlocks.push(m);
const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`;
codeBlockPlaceholders.push(placeholder);
Comment thread
luandro marked this conversation as resolved.
Outdated
Expand Down
152 changes: 152 additions & 0 deletions scripts/notion-fetch/generateBlocks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ vi.mock("./imageProcessor", () => ({

vi.mock("./utils", () => ({
sanitizeMarkdownContent: vi.fn((content) => content),
injectExplicitHeadingIds: vi.fn((content) => content),
compressImageToFileWithFallback: vi.fn(),
detectFormatFromBuffer: vi.fn(() => "jpeg"),
formatFromContentType: vi.fn(() => "jpeg"),
Expand Down Expand Up @@ -198,6 +199,7 @@ describe("generateBlocks", () => {
let fetchNotionBlocks: Mock;
let processImage: Mock;
let compressImageToFileWithFallback: Mock;
let injectExplicitHeadingIds: Mock;

beforeEach(async () => {
restoreEnv = installTestNotionEnv();
Expand All @@ -223,6 +225,7 @@ describe("generateBlocks", () => {
const utils = await import("./utils");
compressImageToFileWithFallback =
utils.compressImageToFileWithFallback as Mock;
injectExplicitHeadingIds = utils.injectExplicitHeadingIds as Mock;

// Setup default mock implementations
processImage.mockResolvedValue(mockProcessedImageResult);
Expand Down Expand Up @@ -377,6 +380,155 @@ describe("generateBlocks", () => {
});
});

// Covers the locale-aware slug and internal-link pipeline in generateBlocks.
describe("Localized slug and link normalization", () => {
it("should derive the shared ASCII slug from the grouped title for every locale", async () => {
const { generateBlocks } = await import("./generateBlocks");
const mockWriteFileSync = fs.writeFileSync as Mock;

// One grouped "main" page plus a localized sub-page per locale, all
// sharing the same accented title.
const mainPage = createMockNotionPage({
id: "main-accented",
title: "Título con acentos",
elementType: "Page",
subItems: ["en-accented", "es-accented", "pt-accented"],
});
const englishPage = createMockNotionPage({
id: "en-accented",
title: "Título con acentos",
language: "English",
elementType: "Page",
});
const spanishPage = createMockNotionPage({
id: "es-accented",
title: "Título con acentos",
language: "Spanish",
elementType: "Page",
});
const portuguesePage = createMockNotionPage({
id: "pt-accented",
title: "Título con acentos",
language: "Portuguese",
elementType: "Page",
});

n2m.pageToMarkdown.mockResolvedValue([]);
n2m.toMarkdownString.mockReturnValue({ parent: "Body content" });

await generateBlocks(
[mainPage, englishPage, spanishPage, portuguesePage],
vi.fn()
);

// Only markdown writes matter here; ignore any other writeFileSync calls.
const markdownPaths = mockWriteFileSync.mock.calls
.map((call) => call[0])
.filter(
(value): value is string =>
typeof value === "string" && value.endsWith(".md")
);

// Every locale (default tree + i18n trees) should use the same ASCII slug.
expect(markdownPaths).toEqual(
expect.arrayContaining([
expect.stringContaining("titulo-con-acentos.md"),
expect.stringContaining(
"i18n/pt/docusaurus-plugin-content-docs/current/titulo-con-acentos.md"
),
expect.stringContaining(
"i18n/es/docusaurus-plugin-content-docs/current/titulo-con-acentos.md"
),
])
);
});

it("should normalize localized internal docs links before writing markdown", async () => {
const { generateBlocks } = await import("./generateBlocks");
const mockWriteFileSync = fs.writeFileSync as Mock;

const pageFamily = createMockPageFamily("Página de prueba", "Page");
n2m.pageToMarkdown.mockResolvedValue([]);
// One markdown payload per locale in processing order (en, pt, es) —
// matched below by which links each written file ends up containing.
n2m.toMarkdownString
.mockReturnValueOnce({
parent:
"[doc](/docs/Guía Rápida#Título Uno) [external](https://example.com/Árbol) [relative](./Guía Local#Título)",
})
.mockReturnValueOnce({
parent:
"[doc](/docs/Guía Rápida#Título Uno) [nested](/docs/Category Name/Sub Página#Título Dos)",
})
.mockReturnValueOnce({
parent: "[doc](/docs/Guía Rápida#Título Uno)",
});

await generateBlocks(pageFamily.pages, vi.fn());

const markdownWrites = mockWriteFileSync.mock.calls.filter(
(call) => typeof call[0] === "string" && call[0].endsWith(".md")
);

// Locate each locale's write by path prefix and normalized link content.
const englishOutput = markdownWrites.find(
(call) =>
typeof call[0] === "string" &&
!call[0].includes("/i18n/") &&
call[1].includes("/docs/guia-rapida#titulo-uno")
);
const portugueseOutput = markdownWrites.find(
(call) =>
typeof call[0] === "string" &&
call[0].includes("/i18n/pt/") &&
call[1].includes("/pt/docs/guia-rapida#titulo-uno")
);
const spanishOutput = markdownWrites.find(
(call) =>
typeof call[0] === "string" &&
call[0].includes("/i18n/es/") &&
call[1].includes("/es/docs/guia-rapida#titulo-uno")
);

// Internal docs links are slugified (with locale prefixes for i18n);
// external and relative links must pass through untouched.
expect(englishOutput?.[1]).toContain(
"[doc](/docs/guia-rapida#titulo-uno)"
);
expect(englishOutput?.[1]).toContain(
"[external](https://example.com/Árbol)"
);
expect(englishOutput?.[1]).toContain("[relative](./Guía Local#Título)");
expect(portugueseOutput?.[1]).toContain(
"[nested](/pt/docs/category-name/sub-pagina#titulo-dos)"
);
expect(spanishOutput?.[1]).toContain(
"[doc](/es/docs/guia-rapida#titulo-uno)"
);
});

it("should pass the de-duplicated content through heading ID injection before writing", async () => {
const { generateBlocks } = await import("./generateBlocks");
const mockWriteFileSync = fs.writeFileSync as Mock;

const page = createMockNotionPage({
id: "heading-page",
title: "Heading Title",
elementType: "Page",
language: "English",
});

n2m.pageToMarkdown.mockResolvedValue([]);
n2m.toMarkdownString.mockReturnValue({
parent: "# Heading Title\n\n## Título Único\nContent body",
});
// Stamp the output so we can prove the injected result is what is written.
injectExplicitHeadingIds.mockImplementation(
(content: string) => `${content}\n<!-- ids injected -->`
);

await generateBlocks([page], vi.fn());

// The duplicated H1 (page title) is stripped before ID injection runs.
expect(injectExplicitHeadingIds).toHaveBeenCalledWith(
"## Título Único\nContent body"
);

const markdownWrite = mockWriteFileSync.mock.calls.find(
(call) => typeof call[0] === "string" && call[0].endsWith(".md")
);
expect(markdownWrite?.[1]).toContain("<!-- ids injected -->");
});
});

describe("Title fallbacks", () => {
it("should fallback to legacy Title property when Content elements is missing", async () => {
const { generateBlocks } = await import("./generateBlocks");
Expand Down
18 changes: 11 additions & 7 deletions scripts/notion-fetch/generateBlocks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ import type {
import { n2m } from "../notionClient";
import { NOTION_PROPERTIES } from "../constants";
import chalk from "chalk";
import { sanitizeMarkdownContent } from "./utils";
import { sanitizeMarkdownContent, injectExplicitHeadingIds } from "./utils";
import { createSafeSlug } from "./slugUtils";
import { normalizeInternalDocLinks } from "./linkNormalizer";
import config from "../../docusaurus.config";
import SpinnerManager from "./spinnerManager";
import { convertCalloutToAdmonition, isCalloutBlock } from "./calloutProcessor";
Expand Down Expand Up @@ -528,6 +530,10 @@ async function processSinglePage(
emojiCount += result.fallbackEmojiCount;
contentHasS3 = result.containsS3;

markdownString.parent = normalizeInternalDocLinks(
markdownString.parent,
lang
);
markdownString.parent = sanitizeMarkdownContent(markdownString.parent);
Comment thread
luandro marked this conversation as resolved.

markdownString.parent = ensureBlankLineAfterStandaloneBold(
Expand All @@ -538,18 +544,19 @@ async function processSinglePage(
markdownString.parent,
pageTitle
);
const finalContentBody = injectExplicitHeadingIds(contentBody);

const sectionFolderForWrite: Record<string, string | undefined> = {};

sectionFolderForWrite[lang] = currentSectionFolderForLang;

const finalDiagnostics = getImageDiagnostics(markdownString.parent ?? "");
const finalDiagnostics = getImageDiagnostics(finalContentBody ?? "");
contentHasS3 = finalDiagnostics.s3Matches > 0;

writeMarkdownFile(
filePath,
frontmatter,
contentBody,
finalContentBody,
pageTitle,
pageProcessingIndex - 1,
totalPages,
Expand Down Expand Up @@ -887,10 +894,7 @@ export async function generateBlocks(
? sectionTypeRaw.trim()
: String(sectionTypeRaw ?? "").trim();
const normalizedSectionType = sectionTypeString.toLowerCase();
const filename = title
.toLowerCase()
.replace(/\s+/g, "-")
.replace(/[^a-z0-9-]/g, "");
const filename = createSafeSlug(title);

const orderedLocales = getOrderedLocales(Object.keys(pageByLang.content));
for (const lang of orderedLocales) {
Expand Down
Loading
Loading