From 69ea317ae114214669b003dc99417e986f700180 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Wed, 22 Oct 2025 19:51:09 +0900 Subject: [PATCH 01/12] feat: Enhance link handling - Introduced `index_file_name` option in the plugin configuration with a default value of "index.md". - Refactored link handling in the Markdown generation process to convert relative links to absolute URLs. --- src/mkdocs_llmstxt/_internal/config.py | 1 + src/mkdocs_llmstxt/_internal/plugin.py | 142 ++++++++++++++------- src/mkdocs_llmstxt/_internal/preprocess.py | 5 +- tests/test_plugin.py | 13 ++ 4 files changed, 114 insertions(+), 47 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/config.py b/src/mkdocs_llmstxt/_internal/config.py index cea5c9d..6817e5d 100644 --- a/src/mkdocs_llmstxt/_internal/config.py +++ b/src/mkdocs_llmstxt/_internal/config.py @@ -14,6 +14,7 @@ class _PluginConfig(BaseConfig): base_url = mkconf.Optional(mkconf.Type(str)) markdown_description = mkconf.Optional(mkconf.Type(str)) full_output = mkconf.Optional(mkconf.Type(str)) + index_file_name = mkconf.Type(str, default="index.md") sections = mkconf.DictOfItems( # Each list item can either be: # diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index f255734..05c2d8b 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -122,18 +122,7 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None page: The page object. """ if (src_uri := page.file.src_uri) in self._file_uris: - path_md = Path(page.file.abs_dest_path).with_suffix(".md") - page_md = _generate_page_markdown( - html, - should_autoclean=self.config.autoclean, - preprocess=self.config.preprocess, - path=str(path_md), - ) - - md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() - # Apply the same logic as in the `Page.url` property. - if md_url in (".", "./"): - md_url = "" + page_md = self._generate_page_markdown(html, page) # Use `base_url` if it exists. if self.config.base_url is not None: @@ -143,11 +132,16 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None base = cast("str", self.mkdocs_config.site_url) if not base.endswith("/"): base += "/" + + md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() + # Apply the same logic as in the `Page.url` property. + if md_url in (".", "./"): + md_url = "" md_url = urljoin(base, md_url) self._md_pages[src_uri] = _MDPageInfo( - title=page.title if page.title is not None else src_uri, - path_md=path_md, + title=str(page.title) if page.title is not None else src_uri, + path_md=_get_page_md_path(page), md_url=md_url, content=page_md, ) @@ -199,6 +193,94 @@ def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None: # noqa full_output_file.write_text(full_markdown, encoding="utf8") _logger.debug(f"Generated file /{self.config.full_output}.txt") + def _generate_page_markdown(self, html: str, page: Page) -> str: + """Convert HTML to Markdown. + + Parameters: + html: The HTML content. + page: The page object. + + Returns: + The Markdown content. + """ + soup = Soup(html, "html.parser") + if self.config.autoclean: + autoclean(soup) + if self.config.preprocess: + _preprocess(soup, self.config.preprocess, str(_get_page_md_path(page))) + + # Get base_uri from config + if self.config.base_url is not None: + base_uri = cast("str", self.config.base_url) + else: + base_uri = cast("str", self.mkdocs_config.site_url) + if not base_uri.endswith("/"): + base_uri += "/" + + dest_uri_parent = _get_parent_directory(page.file.dest_uri) + self._handle_links(soup, base_uri, dest_uri_parent) + return mdformat.text( + _converter.convert_soup(soup), + options={"wrap": "no"}, + extensions=("tables",), + ) + + def _handle_links(self, soup: Soup, base_uri: str, current_dir: str) -> None: + """Handle links in the HTML. + + Parameters: + soup: The soup to modify. + base_uri: The base URI of the site. + current_dir: The current directory of the page (relative to site root). + """ + # Find all anchor tags with href attributes + for link in soup.find_all("a", href=True): + href = link.get("href") + + # Skip if href is not a string or is empty + if not isinstance(href, str) or not href: + continue + + # Skip if it's already an absolute URL (starts with http:// or https://) + if href.startswith(("http://", "https://")): + continue + + # Skip if it's a mailto: or other protocol links + if ":" in href and not href.startswith("/"): + continue + + # Skip if it's an anchor link (starts with #) + if href.startswith("#"): + continue + + # Convert relative link to absolute + if href.startswith("/"): + # Absolute path from site root + final_href = urljoin(base_uri, href) + else: + # Relative path from current directory + if current_dir: + relative_base = urljoin(base_uri, current_dir + "/") + else: + relative_base = base_uri + final_href = urljoin(relative_base, href) + + # If the final path ends with /, add index file name + if final_href.endswith("/"): + final_href = final_href + (self.config.index_file_name or "") + + link["href"] = final_href + +def _get_page_md_path(page: Page) -> Path: + return Path(page.file.abs_dest_path).with_suffix(".md") + + +def _get_parent_directory(dest_uri: str) -> str: + if dest_uri == ".": + return "" + else: + return str(Path(dest_uri).parent) + def _language_callback(tag: Tag) -> str: for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()): @@ -212,34 +294,4 @@ def _language_callback(tag: Tag) -> str: code_language_callback=_language_callback, escape_underscores=False, heading_style=ATX, -) - - -def _generate_page_markdown( - html: str, - *, - should_autoclean: bool, - preprocess: str | None, - path: str, -) -> str: - """Convert HTML to Markdown. - - Parameters: - html: The HTML content. - should_autoclean: Whether to autoclean the HTML. - preprocess: An optional path of a Python module containing a `preprocess` function. - path: The output path of the relevant Markdown file. - - Returns: - The Markdown content. - """ - soup = Soup(html, "html.parser") - if should_autoclean: - autoclean(soup) - if preprocess: - _preprocess(soup, preprocess, path) - return mdformat.text( - _converter.convert_soup(soup), - options={"wrap": "no"}, - extensions=("tables",), - ) +) \ No newline at end of file diff --git a/src/mkdocs_llmstxt/_internal/preprocess.py b/src/mkdocs_llmstxt/_internal/preprocess.py index eb1d989..ff532f2 100644 --- a/src/mkdocs_llmstxt/_internal/preprocess.py +++ b/src/mkdocs_llmstxt/_internal/preprocess.py @@ -4,11 +4,12 @@ import html import sys +from urllib.parse import urljoin from importlib.util import module_from_spec, spec_from_file_location from typing import TYPE_CHECKING from bs4 import BeautifulSoup as Soup -from bs4 import NavigableString +from bs4.element import NavigableString from mkdocs.exceptions import PluginError if TYPE_CHECKING: @@ -99,4 +100,4 @@ def autoclean(soup: Soup) -> None: # Remove line numbers from code blocks. for element in soup.find_all("table", attrs={"class": "highlighttable"}): - element.replace_with(Soup(f"
{html.escape(element.find('code').get_text())}
", "html.parser")) # type: ignore[union-attr] + element.replace_with(Soup(f"
{html.escape(element.find('code').get_text())}
", "html.parser")) # type: ignore[union-attr] \ No newline at end of file diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 69ee121..fab0708 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -20,6 +20,7 @@ "sections": { "Index": ["index.md"], "Usage": [{"page1.md": "Some usage docs."}], + "Links": [{"page2.md": "Page with links."}], }, }, }, @@ -28,6 +29,7 @@ "pages": { "index.md": "# Hello world", "page1.md": "# Usage\n\nSome paragraph.", + "page2.md": "# Links\n\n[Relative link](../index.md)\n[Absolute link](/page1.md)\n[External link](https://example.com)\n[Anchor link](#section)", }, }, ], @@ -56,3 +58,14 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None: page1md = Path(mkdocs_conf.site_dir, "page1/index.md") assert page1md.exists() assert "Some paragraph." in page1md.read_text() + + # Test relative link conversion + page2md = Path(mkdocs_conf.site_dir, "page2/index.md") + assert page2md.exists() + page2md_content = page2md.read_text() + + # Check that relative links are converted to absolute URLs + assert "https://example.org/en/0.1.34/index.md" in page2md_content # ../index.md converted + assert "https://example.org/page1.md" in page2md_content # /page1.md converted (absolute from domain root) + assert "https://example.com" in page2md_content # External link unchanged + assert "#section" in page2md_content # Anchor link unchanged From 87094aa04db588d9a44fe566340c2ee71eaf9578 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Wed, 22 Oct 2025 20:07:25 +0900 Subject: [PATCH 02/12] fix: revert preprocess.py --- src/mkdocs_llmstxt/_internal/preprocess.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/preprocess.py b/src/mkdocs_llmstxt/_internal/preprocess.py index ff532f2..4a0af9f 100644 --- a/src/mkdocs_llmstxt/_internal/preprocess.py +++ b/src/mkdocs_llmstxt/_internal/preprocess.py @@ -4,12 +4,11 @@ import html import sys -from urllib.parse import urljoin from importlib.util import module_from_spec, spec_from_file_location from typing import TYPE_CHECKING from bs4 import BeautifulSoup as Soup -from bs4.element import NavigableString +from bs4 import NavigableString from mkdocs.exceptions import PluginError if TYPE_CHECKING: From fb3bde0245bfce9e2caea5487ea6c46c0cc31440 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Wed, 22 Oct 2025 20:07:46 +0900 Subject: [PATCH 03/12] refactor: remove duplicated code --- src/mkdocs_llmstxt/_internal/plugin.py | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index 05c2d8b..391f4a2 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -124,14 +124,8 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None if (src_uri := page.file.src_uri) in self._file_uris: page_md = self._generate_page_markdown(html, page) - # Use `base_url` if it exists. - if self.config.base_url is not None: - base = cast("str", self.config.base_url) - else: - # Use `site_url`, which we assume to be always specified. - base = cast("str", self.mkdocs_config.site_url) - if not base.endswith("/"): - base += "/" + # Get base URL + base = self._get_base_url() md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() # Apply the same logic as in the `Page.url` property. @@ -210,12 +204,7 @@ def _generate_page_markdown(self, html: str, page: Page) -> str: _preprocess(soup, self.config.preprocess, str(_get_page_md_path(page))) # Get base_uri from config - if self.config.base_url is not None: - base_uri = cast("str", self.config.base_url) - else: - base_uri = cast("str", self.mkdocs_config.site_url) - if not base_uri.endswith("/"): - base_uri += "/" + base_uri = self._get_base_url() dest_uri_parent = _get_parent_directory(page.file.dest_uri) self._handle_links(soup, base_uri, dest_uri_parent) @@ -271,6 +260,16 @@ def _handle_links(self, soup: Soup, base_uri: str, current_dir: str) -> None: link["href"] = final_href + def _get_base_url(self) -> str: + if self.config.base_url is not None: + base_url = cast("str", self.config.base_url) + else: + base_url = cast("str", self.mkdocs_config.site_url) + if not base_url.endswith("/"): + base_url += "/" + return base_url + + def _get_page_md_path(page: Page) -> Path: return Path(page.file.abs_dest_path).with_suffix(".md") From a64b9b5a3787ed812d8ecf19c6f5a3aaf6876788 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Wed, 22 Oct 2025 20:12:57 +0900 Subject: [PATCH 04/12] feat: Add `absolute_link` config - Introduced `absolute_link` option in the plugin configuration to control the conversion of relative links to absolute URLs. --- src/mkdocs_llmstxt/_internal/config.py | 1 + src/mkdocs_llmstxt/_internal/plugin.py | 13 ++++++------- tests/test_plugin.py | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/config.py b/src/mkdocs_llmstxt/_internal/config.py index 6817e5d..37dcddf 100644 --- a/src/mkdocs_llmstxt/_internal/config.py +++ b/src/mkdocs_llmstxt/_internal/config.py @@ -13,6 +13,7 @@ class _PluginConfig(BaseConfig): preprocess = mkconf.Optional(mkconf.File(exists=True)) base_url = mkconf.Optional(mkconf.Type(str)) markdown_description = mkconf.Optional(mkconf.Type(str)) + absolute_link = mkconf.Type(bool, default=False) full_output = mkconf.Optional(mkconf.Type(str)) index_file_name = mkconf.Type(str, default="index.md") sections = mkconf.DictOfItems( diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index 391f4a2..c76f566 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -202,19 +202,18 @@ def _generate_page_markdown(self, html: str, page: Page) -> str: autoclean(soup) if self.config.preprocess: _preprocess(soup, self.config.preprocess, str(_get_page_md_path(page))) - - # Get base_uri from config - base_uri = self._get_base_url() - - dest_uri_parent = _get_parent_directory(page.file.dest_uri) - self._handle_links(soup, base_uri, dest_uri_parent) + if self.config.absolute_link: + base_uri = self._get_base_url() + current_dir = _get_parent_directory(page.file.dest_uri) + self._convert_to_absolute_links(soup, base_uri, current_dir) + return mdformat.text( _converter.convert_soup(soup), options={"wrap": "no"}, extensions=("tables",), ) - def _handle_links(self, soup: Soup, base_uri: str, current_dir: str) -> None: + def _convert_to_absolute_links(self, soup: Soup, base_uri: str, current_dir: str) -> None: """Handle links in the HTML. Parameters: diff --git a/tests/test_plugin.py b/tests/test_plugin.py index fab0708..3da2787 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -17,6 +17,7 @@ "llmstxt": { "full_output": "llms-full.txt", "base_url": "https://example.org/en/0.1.34", + "absolute_link": True, "sections": { "Index": ["index.md"], "Usage": [{"page1.md": "Some usage docs."}], From 9ba9089f43d80694ce1903ad5e64145ef21de266 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Wed, 22 Oct 2025 20:18:09 +0900 Subject: [PATCH 05/12] style: Run `make format` and `make check` --- src/mkdocs_llmstxt/_internal/plugin.py | 28 ++++++++++------------ src/mkdocs_llmstxt/_internal/preprocess.py | 2 +- tests/test_plugin.py | 2 +- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index c76f566..d96d43a 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -203,7 +203,7 @@ def _generate_page_markdown(self, html: str, page: Page) -> str: if self.config.preprocess: _preprocess(soup, self.config.preprocess, str(_get_page_md_path(page))) if self.config.absolute_link: - base_uri = self._get_base_url() + base_uri = self._get_base_url() current_dir = _get_parent_directory(page.file.dest_uri) self._convert_to_absolute_links(soup, base_uri, current_dir) @@ -224,39 +224,36 @@ def _convert_to_absolute_links(self, soup: Soup, base_uri: str, current_dir: str # Find all anchor tags with href attributes for link in soup.find_all("a", href=True): href = link.get("href") - + # Skip if href is not a string or is empty if not isinstance(href, str) or not href: continue - + # Skip if it's already an absolute URL (starts with http:// or https://) if href.startswith(("http://", "https://")): continue - + # Skip if it's a mailto: or other protocol links if ":" in href and not href.startswith("/"): continue - + # Skip if it's an anchor link (starts with #) if href.startswith("#"): continue - + # Convert relative link to absolute if href.startswith("/"): # Absolute path from site root final_href = urljoin(base_uri, href) else: # Relative path from current directory - if current_dir: - relative_base = urljoin(base_uri, current_dir + "/") - else: - relative_base = base_uri + relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri final_href = urljoin(relative_base, href) - + # If the final path ends with /, add index file name if final_href.endswith("/"): final_href = final_href + (self.config.index_file_name or "") - + link["href"] = final_href def _get_base_url(self) -> str: @@ -267,7 +264,7 @@ def _get_base_url(self) -> str: if not base_url.endswith("/"): base_url += "/" return base_url - + def _get_page_md_path(page: Page) -> Path: return Path(page.file.abs_dest_path).with_suffix(".md") @@ -276,8 +273,7 @@ def _get_page_md_path(page: Page) -> Path: def _get_parent_directory(dest_uri: str) -> str: if dest_uri == ".": return "" - else: - return str(Path(dest_uri).parent) + return str(Path(dest_uri).parent) def _language_callback(tag: Tag) -> str: @@ -292,4 +288,4 @@ def _language_callback(tag: Tag) -> str: code_language_callback=_language_callback, escape_underscores=False, heading_style=ATX, -) \ No newline at end of file +) diff --git a/src/mkdocs_llmstxt/_internal/preprocess.py b/src/mkdocs_llmstxt/_internal/preprocess.py index 4a0af9f..eb1d989 100644 --- a/src/mkdocs_llmstxt/_internal/preprocess.py +++ b/src/mkdocs_llmstxt/_internal/preprocess.py @@ -99,4 +99,4 @@ def autoclean(soup: Soup) -> None: # Remove line numbers from code blocks. for element in soup.find_all("table", attrs={"class": "highlighttable"}): - element.replace_with(Soup(f"
{html.escape(element.find('code').get_text())}
", "html.parser")) # type: ignore[union-attr] \ No newline at end of file + element.replace_with(Soup(f"
{html.escape(element.find('code').get_text())}
", "html.parser")) # type: ignore[union-attr] diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 3da2787..cc5aa50 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -64,7 +64,7 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None: page2md = Path(mkdocs_conf.site_dir, "page2/index.md") assert page2md.exists() page2md_content = page2md.read_text() - + # Check that relative links are converted to absolute URLs assert "https://example.org/en/0.1.34/index.md" in page2md_content # ../index.md converted assert "https://example.org/page1.md" in page2md_content # /page1.md converted (absolute from domain root) From 5dc425828f2a5070222312063509731825f46ceb Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Sat, 25 Oct 2025 20:34:52 +0900 Subject: [PATCH 06/12] feat: Remove `absolute_link` option and make it default behavior --- src/mkdocs_llmstxt/_internal/config.py | 2 -- src/mkdocs_llmstxt/_internal/plugin.py | 19 +++++++++---------- tests/test_plugin.py | 1 - 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/config.py b/src/mkdocs_llmstxt/_internal/config.py index 37dcddf..cea5c9d 100644 --- a/src/mkdocs_llmstxt/_internal/config.py +++ b/src/mkdocs_llmstxt/_internal/config.py @@ -13,9 +13,7 @@ class _PluginConfig(BaseConfig): preprocess = mkconf.Optional(mkconf.File(exists=True)) base_url = mkconf.Optional(mkconf.Type(str)) markdown_description = mkconf.Optional(mkconf.Type(str)) - absolute_link = mkconf.Type(bool, default=False) full_output = mkconf.Optional(mkconf.Type(str)) - index_file_name = mkconf.Type(str, default="index.md") sections = mkconf.DictOfItems( # Each list item can either be: # diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index d96d43a..ce8998a 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -124,14 +124,11 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None if (src_uri := page.file.src_uri) in self._file_uris: page_md = self._generate_page_markdown(html, page) - # Get base URL - base = self._get_base_url() - md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() # Apply the same logic as in the `Page.url` property. if md_url in (".", "./"): md_url = "" - md_url = urljoin(base, md_url) + md_url = urljoin(self._get_base_url(), md_url) self._md_pages[src_uri] = _MDPageInfo( title=str(page.title) if page.title is not None else src_uri, @@ -202,10 +199,11 @@ def _generate_page_markdown(self, html: str, page: Page) -> str: autoclean(soup) if self.config.preprocess: _preprocess(soup, self.config.preprocess, str(_get_page_md_path(page))) - if self.config.absolute_link: - base_uri = self._get_base_url() - current_dir = _get_parent_directory(page.file.dest_uri) - self._convert_to_absolute_links(soup, base_uri, current_dir) + + # Convert relative links to absolute links + base_uri = self._get_base_url() + current_dir = _get_parent_directory(page.file.dest_uri) + self._convert_to_absolute_links(soup, base_uri, current_dir) return mdformat.text( _converter.convert_soup(soup), @@ -250,9 +248,10 @@ def _convert_to_absolute_links(self, soup: Soup, base_uri: str, current_dir: str relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri final_href = urljoin(relative_base, href) - # If the final path ends with /, add index file name + # Convert directory paths (ending with /) to point to index.md files + # This represents the README.md of the directory if final_href.endswith("/"): - final_href = final_href + (self.config.index_file_name or "") + final_href = final_href + "index.md" link["href"] = final_href diff --git a/tests/test_plugin.py b/tests/test_plugin.py index cc5aa50..f7e2a91 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -17,7 +17,6 @@ "llmstxt": { "full_output": "llms-full.txt", "base_url": "https://example.org/en/0.1.34", - "absolute_link": True, "sections": { "Index": ["index.md"], "Usage": [{"page1.md": "Some usage docs."}], From b5e08450d589f38de435fa7124cac3c8b84572b3 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Sat, 25 Oct 2025 23:16:40 +0900 Subject: [PATCH 07/12] refactor: Extract method to function --- src/mkdocs_llmstxt/_internal/plugin.py | 202 +++++++++++++------------ 1 file changed, 109 insertions(+), 93 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index ce8998a..c4e962c 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -53,6 +53,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]): mkdocs_config: MkDocsConfig """The global MkDocs configuration.""" + _base_url: str _sections: dict[str, dict[str, str]] _file_uris: set[str] _md_pages: dict[str, _MDPageInfo] @@ -88,6 +89,16 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None: if config.site_url is None: raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin") self.mkdocs_config = config + + # Use `base_url` if it exists. + if self.config.base_url is not None: + self._base_url = cast("str", self.config.base_url) + else: + # Use `site_url`, which we assume to be always specified. + self._base_url = cast("str", self.mkdocs_config.site_url) + if not self._base_url.endswith("/"): + self._base_url += "/" + return config def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None: # noqa: ARG002 @@ -122,17 +133,24 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None page: The page object. """ if (src_uri := page.file.src_uri) in self._file_uris: - page_md = self._generate_page_markdown(html, page) + path_md = Path(page.file.abs_dest_path).with_suffix(".md") + page_md = _generate_page_markdown( + html, + should_autoclean=self.config.autoclean, + preprocess=self.config.preprocess, + path=str(path_md), + base_uri=self._base_url, + ) md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() # Apply the same logic as in the `Page.url` property. if md_url in (".", "./"): md_url = "" - md_url = urljoin(self._get_base_url(), md_url) + md_url = urljoin(self._base_url, md_url) self._md_pages[src_uri] = _MDPageInfo( title=str(page.title) if page.title is not None else src_uri, - path_md=_get_page_md_path(page), + path_md=path_md, md_url=md_url, content=page_md, ) @@ -184,96 +202,6 @@ def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None: # noqa full_output_file.write_text(full_markdown, encoding="utf8") _logger.debug(f"Generated file /{self.config.full_output}.txt") - def _generate_page_markdown(self, html: str, page: Page) -> str: - """Convert HTML to Markdown. - - Parameters: - html: The HTML content. - page: The page object. - - Returns: - The Markdown content. - """ - soup = Soup(html, "html.parser") - if self.config.autoclean: - autoclean(soup) - if self.config.preprocess: - _preprocess(soup, self.config.preprocess, str(_get_page_md_path(page))) - - # Convert relative links to absolute links - base_uri = self._get_base_url() - current_dir = _get_parent_directory(page.file.dest_uri) - self._convert_to_absolute_links(soup, base_uri, current_dir) - - return mdformat.text( - _converter.convert_soup(soup), - options={"wrap": "no"}, - extensions=("tables",), - ) - - def _convert_to_absolute_links(self, soup: Soup, base_uri: str, current_dir: str) -> None: - """Handle links in the HTML. - - Parameters: - soup: The soup to modify. - base_uri: The base URI of the site. - current_dir: The current directory of the page (relative to site root). - """ - # Find all anchor tags with href attributes - for link in soup.find_all("a", href=True): - href = link.get("href") - - # Skip if href is not a string or is empty - if not isinstance(href, str) or not href: - continue - - # Skip if it's already an absolute URL (starts with http:// or https://) - if href.startswith(("http://", "https://")): - continue - - # Skip if it's a mailto: or other protocol links - if ":" in href and not href.startswith("/"): - continue - - # Skip if it's an anchor link (starts with #) - if href.startswith("#"): - continue - - # Convert relative link to absolute - if href.startswith("/"): - # Absolute path from site root - final_href = urljoin(base_uri, href) - else: - # Relative path from current directory - relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri - final_href = urljoin(relative_base, href) - - # Convert directory paths (ending with /) to point to index.md files - # This represents the README.md of the directory - if final_href.endswith("/"): - final_href = final_href + "index.md" - - link["href"] = final_href - - def _get_base_url(self) -> str: - if self.config.base_url is not None: - base_url = cast("str", self.config.base_url) - else: - base_url = cast("str", self.mkdocs_config.site_url) - if not base_url.endswith("/"): - base_url += "/" - return base_url - - -def _get_page_md_path(page: Page) -> Path: - return Path(page.file.abs_dest_path).with_suffix(".md") - - -def _get_parent_directory(dest_uri: str) -> str: - if dest_uri == ".": - return "" - return str(Path(dest_uri).parent) - def _language_callback(tag: Tag) -> str: for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()): @@ -288,3 +216,91 @@ def _language_callback(tag: Tag) -> str: escape_underscores=False, heading_style=ATX, ) + + +def _generate_page_markdown( + html: str, + *, + should_autoclean: bool, + preprocess: str | None, + path: str, + base_uri: str, +) -> str: + """Convert HTML to Markdown. + + Parameters: + html: The HTML content. + should_autoclean: Whether to autoclean the HTML. + preprocess: An optional path of a Python module containing a `preprocess` function. + path: The output path of the relevant Markdown file. + base_uri: The base URI of the site. + + Returns: + The Markdown content. + """ + soup = Soup(html, "html.parser") + if should_autoclean: + autoclean(soup) + if preprocess: + _preprocess(soup, preprocess, path) + + # Convert relative links to absolute links + current_dir = _get_parent_directory(path) + _convert_to_absolute_links(soup, base_uri, current_dir) + + return mdformat.text( + _converter.convert_soup(soup), + options={"wrap": "no"}, + extensions=("tables",), + ) + + +def _convert_to_absolute_links(soup: Soup, base_uri: str, current_dir: str) -> None: + """Handle links in the HTML. + + Parameters: + soup: The soup to modify. + base_uri: The base URI of the site. + current_dir: The current directory of the page (relative to site root). + """ + # Find all anchor tags with href attributes + for link in soup.find_all("a", href=True): + href = link.get("href") + + # Skip if href is not a string or is empty + if not isinstance(href, str) or not href: + continue + + # Skip if it's already an absolute URL (starts with http:// or https://) + if href.startswith(("http://", "https://")): + continue + + # Skip if it's a mailto: or other protocol links + if ":" in href and not href.startswith("/"): + continue + + # Skip if it's an anchor link (starts with #) + if href.startswith("#"): + continue + + # Convert relative link to absolute + if href.startswith("/"): + # Absolute path from site root + final_href = urljoin(base_uri, href) + else: + # Relative path from current directory + relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri + final_href = urljoin(relative_base, href) + + # Convert directory paths (ending with /) to point to index.md files + # This represents the README.md of the directory + if final_href.endswith("/"): + final_href = final_href + "index.md" + + link["href"] = final_href + + +def _get_parent_directory(dest_uri: str) -> str: + if dest_uri == ".": + return "" + return str(Path(dest_uri).parent) From 817522f32423309003fa5d394f84b34063fd17eb Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Sun, 26 Oct 2025 08:53:59 +0900 Subject: [PATCH 08/12] fix: Pass dest_uri --- src/mkdocs_llmstxt/_internal/plugin.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index c4e962c..84e2c54 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -140,6 +140,7 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None preprocess=self.config.preprocess, path=str(path_md), base_uri=self._base_url, + page_uri=page.file.dest_uri ) md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() @@ -225,6 +226,7 @@ def _generate_page_markdown( preprocess: str | None, path: str, base_uri: str, + page_uri: str ) -> str: """Convert HTML to Markdown. @@ -234,7 +236,7 @@ def _generate_page_markdown( preprocess: An optional path of a Python module containing a `preprocess` function. path: The output path of the relevant Markdown file. base_uri: The base URI of the site. - + page_uri: The destination URI of the page. Returns: The Markdown content. """ @@ -245,7 +247,7 @@ def _generate_page_markdown( _preprocess(soup, preprocess, path) # Convert relative links to absolute links - current_dir = _get_parent_directory(path) + current_dir = Path(page_uri).parent.as_posix() _convert_to_absolute_links(soup, base_uri, current_dir) return mdformat.text( @@ -297,10 +299,4 @@ def _convert_to_absolute_links(soup: Soup, base_uri: str, current_dir: str) -> N if final_href.endswith("/"): final_href = final_href + "index.md" - link["href"] = final_href - - -def _get_parent_directory(dest_uri: str) -> str: - if dest_uri == ".": - return "" - return str(Path(dest_uri).parent) + link["href"] = final_href \ No newline at end of file From 9f1420d125172dc3c05f5a84b314b1912f952328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Mazzucotelli?= Date: Sun, 26 Oct 2025 11:12:56 +0100 Subject: [PATCH 09/12] wip --- src/mkdocs_llmstxt/_internal/plugin.py | 35 ++++++++++++-------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index 84e2c54..8ecd1e1 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -245,11 +245,7 @@ def _generate_page_markdown( autoclean(soup) if preprocess: _preprocess(soup, preprocess, path) - - # Convert relative links to absolute links - current_dir = Path(page_uri).parent.as_posix() - _convert_to_absolute_links(soup, base_uri, current_dir) - + _convert_to_absolute_links(soup, base_uri, page_uri) return mdformat.text( _converter.convert_soup(soup), options={"wrap": "no"}, @@ -257,46 +253,47 @@ def _generate_page_markdown( ) -def _convert_to_absolute_links(soup: Soup, base_uri: str, current_dir: str) -> None: - """Handle links in the HTML. +def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None: + """Convert relative links to absolute ones in the HTML. Parameters: soup: The soup to modify. base_uri: The base URI of the site. - current_dir: The current directory of the page (relative to site root). + page_uri: The destination URI of the page. """ - # Find all anchor tags with href attributes + current_dir = Path(page_uri).parent.as_posix() + + # Find all anchor tags with `href` attributes. for link in soup.find_all("a", href=True): href = link.get("href") - # Skip if href is not a string or is empty + # Skip if `href` is not a string or is empty. if not isinstance(href, str) or not href: continue - # Skip if it's already an absolute URL (starts with http:// or https://) + # Skip if it's already an absolute URL (starts with `http://` or `https://`). if href.startswith(("http://", "https://")): continue - # Skip if it's a mailto: or other protocol links + # Skip if it's a `mailto:` or other protocol links. if ":" in href and not href.startswith("/"): continue - # Skip if it's an anchor link (starts with #) + # Skip if it's an anchor link (starts with `#`). if href.startswith("#"): continue - # Convert relative link to absolute + # Convert relative link to absolute. if href.startswith("/"): - # Absolute path from site root + # Absolute path from site root. final_href = urljoin(base_uri, href) else: - # Relative path from current directory + # Relative path from current directory. relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri final_href = urljoin(relative_base, href) - # Convert directory paths (ending with /) to point to index.md files - # This represents the README.md of the directory + # Convert directory paths (ending with `/`) to point to `index.md` files. if final_href.endswith("/"): final_href = final_href + "index.md" - link["href"] = final_href \ No newline at end of file + link["href"] = final_href From 02bc5c5a0110bb4125c79e55467f02f3a95448bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Mazzucotelli?= Date: Sun, 26 Oct 2025 11:13:33 +0100 Subject: [PATCH 10/12] style: Format --- src/mkdocs_llmstxt/_internal/plugin.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index 8ecd1e1..c926fce 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -140,7 +140,7 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None preprocess=self.config.preprocess, path=str(path_md), base_uri=self._base_url, - page_uri=page.file.dest_uri + page_uri=page.file.dest_uri, ) md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix() @@ -226,7 +226,7 @@ def _generate_page_markdown( preprocess: str | None, path: str, base_uri: str, - page_uri: str + page_uri: str, ) -> str: """Convert HTML to Markdown. @@ -237,6 +237,7 @@ def _generate_page_markdown( path: The output path of the relevant Markdown file. base_uri: The base URI of the site. page_uri: The destination URI of the page. + Returns: The Markdown content. """ From 22bdbc96d15e6e0e1c3efacf4e574291a1c779eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Mazzucotelli?= Date: Sun, 26 Oct 2025 11:51:44 +0100 Subject: [PATCH 11/12] improve tests --- tests/test_plugin.py | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tests/test_plugin.py b/tests/test_plugin.py index f7e2a91..7756ef1 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -1,6 +1,7 @@ """Tests for the plugin.""" from pathlib import Path +from textwrap import dedent import pytest from mkdocs.commands.build import build @@ -29,7 +30,18 @@ "pages": { "index.md": "# Hello world", "page1.md": "# Usage\n\nSome paragraph.", - "page2.md": "# Links\n\n[Relative link](../index.md)\n[Absolute link](/page1.md)\n[External link](https://example.com)\n[Anchor link](#section)", + "page2.md": dedent( + """ + # Links + + [Relative link 1](./index.md) + [Relative link 2](./page1.md) + [Absolute link 1](/en/0.1.34/index.md) + [Absolute link 2](/en/0.1.34/page1/index.md) + [External link](https://example.com) + [Anchor link](#section) + """, + ), }, }, ], @@ -59,13 +71,32 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None: assert page1md.exists() assert "Some paragraph." in page1md.read_text() - # Test relative link conversion page2md = Path(mkdocs_conf.site_dir, "page2/index.md") assert page2md.exists() page2md_content = page2md.read_text() - # Check that relative links are converted to absolute URLs - assert "https://example.org/en/0.1.34/index.md" in page2md_content # ../index.md converted - assert "https://example.org/page1.md" in page2md_content # /page1.md converted (absolute from domain root) + # Check that relative links are made absolute in each page and in the full llmstxt file. + assert "https://example.org/en/0.1.34/index.md" in page2md_content # ./index.md converted + assert ( + "https://example.org/en/0.1.34/page1/index.md" in page2md_content + ) # /en/0.1.34/page1.md converted (absolute from domain root) assert "https://example.com" in page2md_content # External link unchanged assert "#section" in page2md_content # Anchor link unchanged + + assert "https://example.org/en/0.1.34/index.md" in llmsfulltxt_content + assert "https://example.org/en/0.1.34/page1/index.md" in llmsfulltxt_content + assert "https://example.com" in llmsfulltxt_content + assert "#section" in llmsfulltxt_content + + # Check that llmstxt pages (Markdown) contain links to other llmstxt pages, not HTML ones. + assert '"https://example.org/en/0.1.34/index.html"' not in page2md_content + assert '"https://example.org/en/0.1.34/page1/"' not in page2md_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in page2md_content + + assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmsfulltxt_content + + assert '"https://example.org/en/0.1.34/index.html"' not in llmstxt_content + assert '"https://example.org/en/0.1.34/page1/"' not in llmstxt_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmstxt_content From ea0681fe7f16187c8d6adfc4fca2d15ccb72f748 Mon Sep 17 00:00:00 2001 From: AlphaBs Date: Thu, 30 Oct 2025 11:43:05 +0900 Subject: [PATCH 12/12] fix: skip absolute path --- src/mkdocs_llmstxt/_internal/plugin.py | 29 +++++++++---------- tests/test_plugin.py | 40 +++++++++++++++----------- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index c926fce..bb075a7 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -6,7 +6,7 @@ from itertools import chain from pathlib import Path from typing import TYPE_CHECKING, NamedTuple, cast -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse import mdformat from bs4 import BeautifulSoup as Soup @@ -272,26 +272,25 @@ def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None if not isinstance(href, str) or not href: continue - # Skip if it's already an absolute URL (starts with `http://` or `https://`). - if href.startswith(("http://", "https://")): - continue - - # Skip if it's a `mailto:` or other protocol links. - if ":" in href and not href.startswith("/"): + # Skip if it's an absolute path + if href.startswith("/"): continue # Skip if it's an anchor link (starts with `#`). if href.startswith("#"): continue - # Convert relative link to absolute. - if href.startswith("/"): - # Absolute path from site root. - final_href = urljoin(base_uri, href) - else: - # Relative path from current directory. - relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri - final_href = urljoin(relative_base, href) + # Skip if it's an external link + try: + if urlparse(href).scheme: + continue + except ValueError: + # Invalid URL, skip + continue + + # Relative path from current directory. + relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri + final_href = urljoin(relative_base, href) # Convert directory paths (ending with `/`) to point to `index.md` files. if final_href.endswith("/"): diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 7756ef1..ff3d8d4 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -29,6 +29,7 @@ }, "pages": { "index.md": "# Hello world", + "dummy.md": "# Hello world", "page1.md": "# Usage\n\nSome paragraph.", "page2.md": dedent( """ @@ -36,10 +37,15 @@ [Relative link 1](./index.md) [Relative link 2](./page1.md) - [Absolute link 1](/en/0.1.34/index.md) - [Absolute link 2](/en/0.1.34/page1/index.md) + [Relative link 3](dummy.md) + [Absolute link 1](/abs1/) + [Absolute link 2](/abs2/index.md) [External link](https://example.com) [Anchor link](#section) + [Email link 1](mailto:test1@example.com) + + [External protocol 1](ftp://example1.com) + [External protocol 2](ftp://example2.com/my/) """, ), }, @@ -76,22 +82,24 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None: page2md_content = page2md.read_text() # Check that relative links are made absolute in each page and in the full llmstxt file. - assert "https://example.org/en/0.1.34/index.md" in page2md_content # ./index.md converted - assert ( - "https://example.org/en/0.1.34/page1/index.md" in page2md_content - ) # /en/0.1.34/page1.md converted (absolute from domain root) - assert "https://example.com" in page2md_content # External link unchanged - assert "#section" in page2md_content # Anchor link unchanged - - assert "https://example.org/en/0.1.34/index.md" in llmsfulltxt_content - assert "https://example.org/en/0.1.34/page1/index.md" in llmsfulltxt_content - assert "https://example.com" in llmsfulltxt_content - assert "#section" in llmsfulltxt_content + assert "(https://example.org/en/0.1.34/index.md)" in page2md_content + assert "(https://example.org/en/0.1.34/page1/index.md)" in page2md_content + assert "(https://example.org/en/0.1.34/dummy/index.md)" in page2md_content + + assert "(/abs1/)" in page2md_content # absolute link unchanged + assert "(/abs2/index.md)" in page2md_content # absolute link unchanged + + assert "(https://example.com)" in page2md_content # External link unchanged + assert "(#section)" in page2md_content # Anchor link unchanged + assert "(mailto:test1@example.com)" in page2md_content + assert "(mailto:test2@example.com)" in page2md_content + assert "(ftp://example1.com)" in page2md_content + assert "(ftp://example2.com/my/)" in page2md_content # index.md not included # Check that llmstxt pages (Markdown) contain links to other llmstxt pages, not HTML ones. - assert '"https://example.org/en/0.1.34/index.html"' not in page2md_content - assert '"https://example.org/en/0.1.34/page1/"' not in page2md_content - assert '"https://example.org/en/0.1.34/page1/index.html"' not in page2md_content + assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content + assert '"https://example.org/en/0.1.34/page1/index.html"' not in llmsfulltxt_content assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content assert '"https://example.org/en/0.1.34/page1/"' not in llmsfulltxt_content