Coverage for src/mkdocs_llmstxt/_internal/plugin.py: 90.23%
97 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-08 13:40 +0200
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-08 13:40 +0200
1# MkDocs plugin that generates a Markdown file at the end of the build.
3from __future__ import annotations
5import fnmatch
6from itertools import chain
7from pathlib import Path
8from typing import TYPE_CHECKING, NamedTuple, cast
9from urllib.parse import urljoin
11import mdformat
12from bs4 import BeautifulSoup as Soup
13from bs4 import Tag
14from markdownify import ATX, MarkdownConverter
15from mkdocs.config.defaults import MkDocsConfig
16from mkdocs.plugins import BasePlugin
17from mkdocs.structure.pages import Page
19from mkdocs_llmstxt._internal.config import _PluginConfig
20from mkdocs_llmstxt._internal.logger import _get_logger
21from mkdocs_llmstxt._internal.preprocess import _preprocess, autoclean
23if TYPE_CHECKING:
24 from typing import Any
26 from mkdocs.config.defaults import MkDocsConfig
27 from mkdocs.structure.files import Files
28 from mkdocs.structure.pages import Page
# Module-level logger, obtained from the package's `_get_logger` helper and keyed on this module's name.
_logger = _get_logger(__name__)
class _MDPageInfo(NamedTuple):
    """Data collected for one page converted to Markdown.

    Instances are created while pages are rendered and consumed once the build is finished.
    """

    # NOTE: the field order is the tuple's positional layout; keep it stable
    # for any code that unpacks instances positionally.

    # Page title, or the page's source URI when the page has no title.
    title: str
    # Destination path of the page with its suffix replaced by `.md`.
    path_md: Path
    # URL of the Markdown file, resolved against the configured `site_url`.
    md_url: str
    # Markdown content generated from the page's rendered HTML.
    content: str
class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
    """The MkDocs plugin to generate an `llms.txt` file.

    This plugin defines the following event hooks:

    - `on_config`
    - `on_files`
    - `on_page_content`
    - `on_post_build`

    Check the [Developing Plugins](https://www.mkdocs.org/user-guide/plugins/#developing-plugins) page of `mkdocs`
    for more information about its plugin system.
    """

    mkdocs_config: MkDocsConfig
    """The global MkDocs configuration."""

    md_pages: dict[str, list[_MDPageInfo]]
    """Dictionary mapping section names to a list of page infos."""

    def _expand_inputs(self, inputs: list[str], page_uris: list[str]) -> list[str]:
        """Expand glob patterns found in a list of input files.

        Parameters:
            inputs: File names or glob patterns (only entries containing `*` are treated as patterns).
            page_uris: The known source URIs to match patterns against.

        Returns:
            The inputs, with each pattern replaced by the URIs it matches. Plain entries are kept as-is.
        """
        expanded: list[str] = []
        for input_file in inputs:
            if "*" in input_file:
                expanded.extend(fnmatch.filter(page_uris, input_file))
            else:
                expanded.append(input_file)
        return expanded

    def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
        """Save the global MkDocs configuration.

        Hook for the [`on_config` event](https://www.mkdocs.org/user-guide/plugins/#on_config).
        In this hook, we save the global MkDocs configuration into an instance variable,
        to re-use it later.

        Parameters:
            config: The MkDocs config object.

        Returns:
            The same, untouched config.

        Raises:
            ValueError: When `site_url` is not set in the MkDocs configuration.
        """
        if config.site_url is None:
            raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin")
        self.mkdocs_config = config
        # A `defaultdict` could be used, but we need to retain the same order between `config.sections` and `md_pages`
        # (which wouldn't be guaranteed when filling `md_pages` in `on_page_content()`).
        self.md_pages = {section: [] for section in self.config.sections}
        return config

    def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None:  # noqa: ARG002
        """Expand inputs for generated files.

        Hook for the [`on_files` event](https://www.mkdocs.org/user-guide/plugins/#on_files).
        In this hook we expand inputs for generated files (glob patterns using `*`).

        Parameters:
            files: The collection of MkDocs files.
            config: The MkDocs configuration.

        Returns:
            Modified collection or none.
        """
        page_uris = list(files.src_uris)

        # Iterate over a snapshot since the mapping's values are replaced while looping.
        for section_name, file_list in list(self.config.sections.items()):
            self.config.sections[section_name] = self._expand_inputs(file_list, page_uris=page_uris)

        return files

    def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None:  # noqa: ARG002
        """Convert page content into a Markdown file and save the result to be processed in the `on_post_build` hook.

        Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).

        Parameters:
            html: The rendered HTML.
            page: The page object.

        Returns:
            The same, untouched HTML.
        """
        src_uri = page.file.src_uri
        matching_sections = [
            section_name for section_name, file_list in self.config.sections.items() if src_uri in file_list
        ]
        if not matching_sections:
            return html

        # Convert only once, even when the page is listed in several sections.
        path_md = Path(page.file.abs_dest_path).with_suffix(".md")
        page_md = _generate_page_markdown(
            html,
            should_autoclean=self.config.autoclean,
            preprocess=self.config.preprocess,
            path=str(path_md),
        )

        md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
        # Apply the same logic as in the `Page.url` property.
        if md_url in (".", "./"):
            md_url = ""

        # Guaranteed to exist as we require `site_url` to be configured.
        base = cast("str", self.mkdocs_config.site_url)
        # `urljoin` would drop the last path segment of a base without trailing slash.
        if not base.endswith("/"):
            base += "/"
        md_url = urljoin(base, md_url)

        page_info = _MDPageInfo(
            title=page.title if page.title is not None else src_uri,
            path_md=path_md,
            md_url=md_url,
            content=page_md,
        )
        for section_name in matching_sections:
            self.md_pages[section_name].append(page_info)

        return html

    def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None:  # noqa: ARG002
        """Create the final `llms.txt` file and the MD files for all selected pages.

        Hook for the [`on_post_build` event](https://www.mkdocs.org/user-guide/plugins/#on_post_build).

        Parameters:
            config: MkDocs configuration.
        """
        output_file = Path(config.site_dir).joinpath("llms.txt")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        markdown = f"# {config.site_name}\n\n"

        if config.site_description is not None:
            markdown += f"> {config.site_description}\n\n"

        if self.config.markdown_description is not None:
            markdown += f"{self.config.markdown_description}\n\n"

        # The full output shares the same header, but embeds page contents instead of links.
        full_markdown = markdown

        for section_name, file_list in self.md_pages.items():
            markdown += f"## {section_name}\n\n"
            for page_title, path_md, md_url, content in file_list:
                path_md.write_text(content, encoding="utf8")
                _logger.debug(f"Generated MD file to {path_md}")
                markdown += f"- [{page_title}]({md_url})\n"
            markdown += "\n"

        output_file.write_text(markdown, encoding="utf8")
        _logger.debug("Generated file /llms.txt")

        if self.config.full_output is not None:
            full_output_file = Path(config.site_dir).joinpath(self.config.full_output)
            # `full_output` may point into a sub-directory that does not exist yet.
            full_output_file.parent.mkdir(parents=True, exist_ok=True)
            for section_name, file_list in self.md_pages.items():
                list_content = "\n".join(info.content for info in file_list)
                full_markdown += f"# {section_name}\n\n{list_content}"
            full_output_file.write_text(full_markdown, encoding="utf8")
            # `full_output` is used verbatim as the file name, so log it verbatim too.
            _logger.debug(f"Generated file /{self.config.full_output}")
190def _language_callback(tag: Tag) -> str:
191 for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()):
192 if css_class.startswith("language-"): 192 ↛ 191line 192 didn't jump to line 191 because the condition on line 192 was always true
193 return css_class[9:]
194 return ""
# Shared HTML-to-Markdown converter, configured once at import time.
_converter = MarkdownConverter(
    # Headings are emitted in ATX style.
    heading_style=ATX,
    # List items use `-` as bullet character.
    bullets="-",
    # Underscores are left unescaped.
    escape_underscores=False,
    # The language of code blocks is looked up through `_language_callback`.
    code_language_callback=_language_callback,
)
def _generate_page_markdown(
    html: str,
    *,
    should_autoclean: bool,
    preprocess: str | None,
    path: str,
) -> str:
    """Turn a page's rendered HTML into formatted Markdown.

    The HTML is parsed, optionally cleaned and preprocessed,
    converted to Markdown and finally formatted without line wrapping.

    Parameters:
        html: The HTML content.
        should_autoclean: Whether to autoclean the HTML.
        preprocess: An optional path of a Python module containing a `preprocess` function.
        path: The output path of the relevant Markdown file.

    Returns:
        The Markdown content.
    """
    document = Soup(html, "html.parser")

    # Both steps modify the parsed document in place, cleaning first.
    if should_autoclean:
        autoclean(document)
    if preprocess:
        _preprocess(document, preprocess, path)

    raw_markdown = _converter.convert_soup(document)
    return mdformat.text(raw_markdown, options={"wrap": "no"})