diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index f255734..bb075a7 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -6,7 +6,7 @@ from itertools import chain from pathlib import Path from typing import TYPE_CHECKING, NamedTuple, cast -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse import mdformat from bs4 import BeautifulSoup as Soup @@ -53,6 +53,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]): mkdocs_config: MkDocsConfig """The global MkDocs configuration.""" + _base_url: str _sections: dict[str, dict[str, str]] _file_uris: set[str] _md_pages: dict[str, _MDPageInfo] @@ -88,6 +89,16 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None: if config.site_url is None: raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin") self.mkdocs_config = config + + # Use `base_url` if it exists. + if self.config.base_url is not None: + self._base_url = cast("str", self.config.base_url) + else: + # Use `site_url`, which we assume to be always specified. + self._base_url = cast("str", self.mkdocs_config.site_url) + if not self._base_url.endswith("/"): + self._base_url += "/" + return config def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None: # noqa: ARG002 @@ -128,25 +139,18 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None should_autoclean=self.config.autoclean, preprocess=self.config.preprocess, path=str(path_md), + base_uri=self._base_url, + page_uri=page.file.dest_uri, ) md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() # Apply the same logic as in the `Page.url` property. if md_url in (".", "./"): md_url = "" - - # Use `base_url` if it exists. - if self.config.base_url is not None: - base = cast("str", self.config.base_url) - else: - # Use `site_url`, which we assume to be always specified. 
def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None:
    """Convert relative links to absolute ones in the HTML.

    Every `<a href=...>` whose target is relative to the current page is rewritten
    in place to an absolute URL built from `base_uri` and the directory of `page_uri`.
    Links whose path ends with `/` are pointed at the `index.md` file of that
    directory; any query string or fragment on such links is preserved.

    The following links are left untouched: empty or non-string `href` values,
    absolute paths (starting with `/`), pure anchors (starting with `#`),
    links with a URL scheme (`https:`, `mailto:`, `ftp:`, ...), and values
    that `urlparse` rejects.

    Parameters:
        soup: The soup to modify.
        base_uri: The base URI of the site.
        page_uri: The destination URI of the page.
    """
    # `Path(...).parent.as_posix()` is never empty (it is "." for top-level pages),
    # so the directory can always be joined onto the base. It does not depend on
    # the individual link, hence it is computed once, outside the loop.
    current_dir = Path(page_uri).parent.as_posix()
    relative_base = urljoin(base_uri, current_dir + "/")

    # Find all anchor tags with `href` attributes.
    for link in soup.find_all("a", href=True):
        href = link.get("href")

        # Skip if `href` is not a string or is empty.
        if not isinstance(href, str) or not href:
            continue

        # Skip absolute paths (`/...`) and anchor links (`#...`).
        if href.startswith(("/", "#")):
            continue

        # Skip external links (anything carrying a scheme) and invalid URLs.
        try:
            if urlparse(href).scheme:
                continue
        except ValueError:
            continue

        final_href = urljoin(relative_base, href)

        # Convert directory paths (ending with `/`) to point to `index.md` files.
        # The check is done on the path component so that links such as
        # `page/#section` or `page/?q=1` are converted too.
        parsed = urlparse(final_href)
        if parsed.path.endswith("/"):
            final_href = parsed._replace(path=parsed.path + "index.md").geturl()

        link["href"] = final_href
+ assert "(https://example.org/en/0.1.34/index.md)" in page2md_content + assert "(https://example.org/en/0.1.34/page1/index.md)" in page2md_content + assert "(https://example.org/en/0.1.34/dummy/index.md)" in page2md_content + + assert "(/abs1/)" in page2md_content # absolute link unchanged + assert "(/abs2/index.md)" in page2md_content # absolute link unchanged + + assert "(https://example.com)" in page2md_content # External link unchanged + assert "(#section)" in page2md_content # Anchor link unchanged + assert "(mailto:test1@example.com)" in page2md_content + assert "(mailto:test2@example.com)" in page2md_content + assert "(ftp://example1.com)" in page2md_content + assert "(ftp://example2.com/my/)" in page2md_content # index.md not included + + # Check that llmstxt pages (Markdown) contain links to other llmstxt pages, not HTML ones. + assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmsfulltxt_content + + assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmsfulltxt_content + + assert '"https://example.org/en/0.1.34/index.html"' not in llmstxt_content + assert '"https://example.org/en/0.1.34/page1/"' not in llmstxt_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmstxt_content