Coverage for src/mkdocs_llmstxt/_internal/plugin.py: 90.23%

97 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-08 13:40 +0200

1# MkDocs plugin that generates a Markdown file at the end of the build. 

2 

3from __future__ import annotations 

4 

5import fnmatch 

6from itertools import chain 

7from pathlib import Path 

8from typing import TYPE_CHECKING, NamedTuple, cast 

9from urllib.parse import urljoin 

10 

11import mdformat 

12from bs4 import BeautifulSoup as Soup 

13from bs4 import Tag 

14from markdownify import ATX, MarkdownConverter 

15from mkdocs.config.defaults import MkDocsConfig 

16from mkdocs.plugins import BasePlugin 

17from mkdocs.structure.pages import Page 

18 

19from mkdocs_llmstxt._internal.config import _PluginConfig 

20from mkdocs_llmstxt._internal.logger import _get_logger 

21from mkdocs_llmstxt._internal.preprocess import _preprocess, autoclean 

22 

23if TYPE_CHECKING: 

24 from typing import Any 

25 

26 from mkdocs.config.defaults import MkDocsConfig 

27 from mkdocs.structure.files import Files 

28 from mkdocs.structure.pages import Page 

29 

30 

# Module-level logger, created through the package's `_get_logger` helper
# and named after this module.
_logger = _get_logger(__name__)

32 

33 

class _MDPageInfo(NamedTuple):
    """Information collected for one page selected for the `llms.txt` output.

    Instances are created in `on_page_content` and consumed in `on_post_build`.
    """

    # Page title; falls back to the page source URI when the page has no title.
    title: str
    # Destination path of the generated Markdown file (the page destination path with a `.md` suffix).
    path_md: Path
    # Absolute URL of the generated Markdown file, built from the configured `site_url`.
    md_url: str
    # Markdown content generated from the page's rendered HTML.
    content: str

39 

40 

class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
    """The MkDocs plugin to generate an `llms.txt` file.

    This plugin defines the following event hooks:

    - `on_config`
    - `on_files`
    - `on_page_content`
    - `on_post_build`

    Check the [Developing Plugins](https://www.mkdocs.org/user-guide/plugins/#developing-plugins) page of `mkdocs`
    for more information about its plugin system.
    """

    mkdocs_config: MkDocsConfig
    """The global MkDocs configuration."""

    md_pages: dict[str, list[_MDPageInfo]]
    """Dictionary mapping section names to a list of page infos."""

    def _expand_inputs(self, inputs: list[str], page_uris: list[str]) -> list[str]:
        """Expand glob patterns in a list of input files.

        Entries containing `*` are matched against `page_uris` with `fnmatch.filter`
        and replaced by the matching URIs (possibly none); other entries are kept as is.

        Parameters:
            inputs: The configured input files or glob patterns.
            page_uris: The source URIs of all known files.

        Returns:
            The expanded list of input files.
        """
        expanded: list[str] = []
        for input_file in inputs:
            if "*" in input_file:
                expanded.extend(fnmatch.filter(page_uris, input_file))
            else:
                expanded.append(input_file)
        return expanded

    def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
        """Save the global MkDocs configuration.

        Hook for the [`on_config` event](https://www.mkdocs.org/user-guide/plugins/#on_config).
        In this hook, we save the global MkDocs configuration into an instance variable,
        to re-use it later.

        Parameters:
            config: The MkDocs config object.

        Raises:
            ValueError: When `site_url` is not set in the MkDocs configuration.

        Returns:
            The same, untouched config.
        """
        if config.site_url is None:
            raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin")
        self.mkdocs_config = config
        # A `defaultdict` could be used, but we need to retain the same order between `config.sections` and `md_pages`
        # (which wouldn't be guaranteed when filling `md_pages` in `on_page_content()`).
        self.md_pages = {section: [] for section in self.config.sections}
        return config

    def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None:  # noqa: ARG002
        """Expand inputs for generated files.

        Hook for the [`on_files` event](https://www.mkdocs.org/user-guide/plugins/#on_files).
        In this hook we expand inputs for generated file (glob patterns using `*`).

        Parameters:
            files: The collection of MkDocs files.
            config: The MkDocs configuration.

        Returns:
            Modified collection or none.
        """
        page_uris = list(files.src_uris)

        # Iterate over a snapshot since the sections mapping is updated in place.
        for section_name, file_list in list(self.config.sections.items()):
            self.config.sections[section_name] = self._expand_inputs(file_list, page_uris=page_uris)

        return files

    def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None:  # noqa: ARG002
        """Convert page content into a Markdown file and save the result to be processed in the `on_post_build` hook.

        Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).

        Parameters:
            html: The rendered HTML.
            page: The page object.

        Returns:
            The same, untouched HTML.
        """
        for section_name, file_list in self.config.sections.items():
            if page.file.src_uri not in file_list:
                continue

            path_md = Path(page.file.abs_dest_path).with_suffix(".md")
            page_md = _generate_page_markdown(
                html,
                should_autoclean=self.config.autoclean,
                preprocess=self.config.preprocess,
                path=str(path_md),
            )

            # `with_suffix` always yields a named `.md` path, so the relative URL can never be `.` or `./`.
            md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()

            # Guaranteed to exist as we require `site_url` to be configured.
            base = cast("str", self.mkdocs_config.site_url)
            # `urljoin` drops the last path segment of a base that lacks a trailing slash.
            if not base.endswith("/"):
                base += "/"
            md_url = urljoin(base, md_url)

            self.md_pages[section_name].append(
                _MDPageInfo(
                    title=page.title if page.title is not None else page.file.src_uri,
                    path_md=path_md,
                    md_url=md_url,
                    content=page_md,
                ),
            )

        return html

    def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None:  # noqa: ARG002
        """Create the final `llms.txt` file and the MD files for all selected pages.

        Hook for the [`on_post_build` event](https://www.mkdocs.org/user-guide/plugins/#on_post_build).

        Parameters:
            config: MkDocs configuration.
        """
        site_dir = Path(config.site_dir)
        output_file = site_dir.joinpath("llms.txt")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        markdown = f"# {config.site_name}\n\n"

        if config.site_description is not None:
            markdown += f"> {config.site_description}\n\n"

        if self.config.markdown_description is not None:
            markdown += f"{self.config.markdown_description}\n\n"

        # The full output shares the header but not the per-section link lists.
        full_markdown = markdown

        for section_name, file_list in self.md_pages.items():
            markdown += f"## {section_name}\n\n"
            for page_title, path_md, md_url, content in file_list:
                path_md.parent.mkdir(parents=True, exist_ok=True)
                path_md.write_text(content, encoding="utf8")
                _logger.debug(f"Generated MD file to {path_md}")
                markdown += f"- [{page_title}]({md_url})\n"
            markdown += "\n"

        output_file.write_text(markdown, encoding="utf8")
        _logger.debug("Generated file /llms.txt")

        if self.config.full_output is not None:
            full_output_file = site_dir.joinpath(self.config.full_output)
            # `full_output` may point into a sub-directory of the site directory.
            full_output_file.parent.mkdir(parents=True, exist_ok=True)
            for section_name, file_list in self.md_pages.items():
                list_content = "\n".join(info.content for info in file_list)
                full_markdown += f"# {section_name}\n\n{list_content}"
            full_output_file.write_text(full_markdown, encoding="utf8")
            # Log the path actually written: `full_output` is used verbatim as the file name.
            _logger.debug(f"Generated file /{self.config.full_output}")

188 

189 

def _language_callback(tag: Tag) -> str:
    """Return the code language declared by a `language-*` CSS class.

    The classes of the tag itself are inspected first, then the classes
    of its parent (when it has one). The first class starting with
    `language-` wins.

    Parameters:
        tag: The HTML tag to inspect.

    Returns:
        The class name without its `language-` prefix, or an empty string
        when no such class is found.
    """
    prefix = "language-"
    own_classes = tag.get("class") or ()
    parent = tag.parent
    parent_classes = (parent.get("class") or ()) if parent else ()
    languages = (
        css_class[len(prefix):]
        for css_class in chain(own_classes, parent_classes)
        if css_class.startswith(prefix)
    )
    return next(languages, "")

195 

196 

# Shared HTML-to-Markdown converter used by `_generate_page_markdown`:
# `-` as list bullet, ATX headings, underscores left unescaped, and the
# code block language resolved through `_language_callback`.
_converter = MarkdownConverter(
    bullets="-",
    code_language_callback=_language_callback,
    escape_underscores=False,
    heading_style=ATX,
)

203 

204 

def _generate_page_markdown(
    html: str,
    *,
    should_autoclean: bool,
    preprocess: str | None,
    path: str,
) -> str:
    """Turn rendered HTML into formatted Markdown.

    The HTML is parsed, optionally cleaned and preprocessed, converted
    with the module-level converter, then formatted with `mdformat`
    (line wrapping disabled).

    Parameters:
        html: The HTML content.
        should_autoclean: Whether to autoclean the HTML.
        preprocess: An optional path of a Python module containing a `preprocess` function.
        path: The output path of the relevant Markdown file.

    Returns:
        The Markdown content.
    """
    document = Soup(html, "html.parser")

    # Both steps receive the same parsed document before it is converted.
    if should_autoclean:
        autoclean(document)
    if preprocess:
        _preprocess(document, preprocess, path)

    raw_markdown = _converter.convert_soup(document)
    return mdformat.text(raw_markdown, options={"wrap": "no"})