Skip to content
78 changes: 66 additions & 12 deletions src/mkdocs_llmstxt/_internal/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from itertools import chain
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple, cast
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse

import mdformat
from bs4 import BeautifulSoup as Soup
Expand Down Expand Up @@ -53,6 +53,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
mkdocs_config: MkDocsConfig
"""The global MkDocs configuration."""

_base_url: str
_sections: dict[str, dict[str, str]]
_file_uris: set[str]
_md_pages: dict[str, _MDPageInfo]
Expand Down Expand Up @@ -88,6 +89,16 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
if config.site_url is None:
raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin")
self.mkdocs_config = config

# Use `base_url` if it exists.
if self.config.base_url is not None:
self._base_url = cast("str", self.config.base_url)
else:
# Use `site_url`, which we assume to be always specified.
self._base_url = cast("str", self.mkdocs_config.site_url)
if not self._base_url.endswith("/"):
self._base_url += "/"

return config

def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None: # noqa: ARG002
Expand Down Expand Up @@ -128,25 +139,18 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None
should_autoclean=self.config.autoclean,
preprocess=self.config.preprocess,
path=str(path_md),
base_uri=self._base_url,
page_uri=page.file.dest_uri,
)

md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
# Apply the same logic as in the `Page.url` property.
if md_url in (".", "./"):
md_url = ""

# Use `base_url` if it exists.
if self.config.base_url is not None:
base = cast("str", self.config.base_url)
else:
# Use `site_url`, which we assume to be always specified.
base = cast("str", self.mkdocs_config.site_url)
if not base.endswith("/"):
base += "/"
md_url = urljoin(base, md_url)
md_url = urljoin(self._base_url, md_url)

self._md_pages[src_uri] = _MDPageInfo(
title=page.title if page.title is not None else src_uri,
title=str(page.title) if page.title is not None else src_uri,
path_md=path_md,
md_url=md_url,
content=page_md,
Expand Down Expand Up @@ -221,6 +225,8 @@ def _generate_page_markdown(
should_autoclean: bool,
preprocess: str | None,
path: str,
base_uri: str,
page_uri: str,
) -> str:
"""Convert HTML to Markdown.

Expand All @@ -229,6 +235,8 @@ def _generate_page_markdown(
should_autoclean: Whether to autoclean the HTML.
preprocess: An optional path of a Python module containing a `preprocess` function.
path: The output path of the relevant Markdown file.
base_uri: The base URI of the site.
page_uri: The destination URI of the page.

Returns:
The Markdown content.
Expand All @@ -238,8 +246,54 @@ def _generate_page_markdown(
autoclean(soup)
if preprocess:
_preprocess(soup, preprocess, path)
_convert_to_absolute_links(soup, base_uri, page_uri)
return mdformat.text(
_converter.convert_soup(soup),
options={"wrap": "no"},
extensions=("tables",),
)


def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None:
    """Convert relative links to absolute ones in the HTML.

    Every `<a href>` holding a relative reference is rewritten in place to an
    absolute URL, resolved against the directory of the current page under
    `base_uri`. Links whose *path* designates a directory (ends with `/`) are
    pointed at the `index.md` file of that directory; any query string or
    fragment is kept after the inserted file name.

    Left untouched: empty or non-string `href` values, absolute paths
    (starting with `/`), pure anchors (starting with `#`), references that
    carry a scheme (`https:`, `mailto:`, `ftp:`, ...), and references that
    `urlparse` rejects.

    Parameters:
        soup: The soup to modify.
        base_uri: The base URI of the site.
        page_uri: The destination URI of the page.
    """
    # The directory of the page does not depend on the link, so resolve it once.
    # `Path.parent` is never empty (it is `.` for a top-level page).
    current_dir = Path(page_uri).parent.as_posix()
    relative_base = urljoin(base_uri, current_dir + "/")

    # Find all anchor tags with `href` attributes.
    for link in soup.find_all("a", href=True):
        href = link.get("href")

        # Skip if `href` is not a string or is empty.
        if not isinstance(href, str) or not href:
            continue

        # Skip absolute paths (`/...`) and anchor links (`#...`).
        if href.startswith(("/", "#")):
            continue

        # Skip external links, i.e. anything with a scheme.
        try:
            if urlparse(href).scheme:
                continue
        except ValueError:
            # Invalid URL, leave it as written.
            continue

        final_href = urljoin(relative_base, href)

        # Convert directory paths (ending with `/`) to point to `index.md` files.
        # Only the path component is inspected, so `dir/#anchor` becomes
        # `dir/index.md#anchor`, and a query or fragment that merely ends
        # with `/` does not get `index.md` appended to it.
        parts = urlparse(final_href)
        if parts.path.endswith("/"):
            final_href = parts._replace(path=parts.path + "index.md").geturl()

        link["href"] = final_href
52 changes: 52 additions & 0 deletions tests/test_plugin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Tests for the plugin."""

from pathlib import Path
from textwrap import dedent

import pytest
from mkdocs.commands.build import build
Expand All @@ -20,14 +21,33 @@
"sections": {
"Index": ["index.md"],
"Usage": [{"page1.md": "Some usage docs."}],
"Links": [{"page2.md": "Page with links."}],
},
},
},
],
},
"pages": {
"index.md": "# Hello world",
"dummy.md": "# Hello world",
"page1.md": "# Usage\n\nSome paragraph.",
"page2.md": dedent(
"""
# Links

[Relative link 1](./index.md)
[Relative link 2](./page1.md)
[Relative link 3](dummy.md)
[Absolute link 1](/abs1/)
[Absolute link 2](/abs2/index.md)
[External link](https://example.com)
[Anchor link](#section)
[Email link 1](mailto:test1@example.com)
<test2@example.com>
[External protocol 1](ftp://example1.com)
[External protocol 2](ftp://example2.com/my/)
""",
),
},
},
],
Expand Down Expand Up @@ -56,3 +76,35 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None:
page1md = Path(mkdocs_conf.site_dir, "page1/index.md")
assert page1md.exists()
assert "Some paragraph." in page1md.read_text()

page2md = Path(mkdocs_conf.site_dir, "page2/index.md")
assert page2md.exists()
page2md_content = page2md.read_text()

# Check that relative links are made absolute in each page and in the full llmstxt file.
assert "(https://example.org/en/0.1.34/index.md)" in page2md_content
assert "(https://example.org/en/0.1.34/page1/index.md)" in page2md_content
assert "(https://example.org/en/0.1.34/dummy/index.md)" in page2md_content

assert "(/abs1/)" in page2md_content # absolute link unchanged
assert "(/abs2/index.md)" in page2md_content # absolute link unchanged

assert "(https://example.com)" in page2md_content # External link unchanged
assert "(#section)" in page2md_content # Anchor link unchanged
assert "(mailto:test1@example.com)" in page2md_content
assert "(mailto:test2@example.com)" in page2md_content
assert "(ftp://example1.com)" in page2md_content
assert "(ftp://example2.com/my/)" in page2md_content # index.md not included

# Check that llmstxt pages (Markdown) contain links to other llmstxt pages, not HTML ones.
assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content
assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content
assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmsfulltxt_content

assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content
assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content
assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmsfulltxt_content

assert '"https://example.org/en/0.1.34/index.html"' not in llmstxt_content
assert '"https://example.org/en/0.1.34/page1/"' not in llmstxt_content
assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmstxt_content