Coverage for src/mkdocs_llmstxt/_internal/preprocess.py: 68.06%
50 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-08 13:40 +0200
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-08 13:40 +0200
1# HTML pre-processing.
3from __future__ import annotations
5import sys
6from importlib.util import module_from_spec, spec_from_file_location
7from typing import TYPE_CHECKING
9from bs4 import BeautifulSoup as Soup
10from bs4 import NavigableString
11from mkdocs.exceptions import PluginError
13if TYPE_CHECKING:
14 from types import ModuleType
16 from bs4 import Tag
def _load_module(module_path: str) -> ModuleType:
    """Load a Python source file as a module.

    The module is registered in `sys.modules` under
    `mkdocs_llmstxt.user_config.<stem>`, where `<stem>` is the file name
    of `module_path` without its final extension.

    Parameters:
        module_path: The path of the Python file to load.

    Returns:
        The executed module.

    Raises:
        RuntimeError: When no import spec or loader can be built for the path.
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    # Use the file stem ("hooks" for "conf/hooks.py"), not the extension,
    # so that different user modules get different names in `sys.modules`.
    module_name = f"mkdocs_llmstxt.user_config.{Path(module_path).stem}"
    spec = spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError("Spec or loader is null")

    module = module_from_spec(spec)
    # Register before executing, so the module can be found by name
    # while its own top-level code is running.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    return module
def _preprocess(soup: Soup, module_path: str, output: str) -> None:
    """Pre-process HTML with a user-defined function.

    The module at `module_path` is loaded and its `preprocess` function is
    called as `preprocess(soup, output)`. The function's return value is
    discarded, so any change must be made to `soup` in place.

    Parameters:
        soup: The HTML (soup) to process before conversion to Markdown.
        module_path: The path of a Python module containing a `preprocess` function.
            The function is called with two positional arguments:
            the soup, then the output path.
        output: The output path of the relevant Markdown file.

    Raises:
        PluginError: When the module cannot be loaded,
            or when calling its `preprocess` function raises.
    """
    try:
        module = _load_module(module_path)
    except Exception as error:
        raise PluginError(f"Could not load module: {error}") from error
    try:
        # A missing `preprocess` attribute surfaces here as well,
        # and is reported as a pre-processing failure.
        module.preprocess(soup, output)
    except Exception as error:
        raise PluginError(f"Could not pre-process HTML: {error}") from error
def _to_remove(tag: Tag) -> bool:
    """Tell whether a tag should be dropped from the soup.

    Parameters:
        tag: The tag to inspect.

    Returns:
        True for images and SVGs, links containing an image,
        permalinks (`a.headerlink`), Twemojis and tab labels; False otherwise.
    """
    name = tag.name

    # Images and SVGs are always removed.
    if name in {"img", "svg"}:
        return True

    is_link = name == "a"

    # Links containing an image go away with it
    # (only `img` descendants are looked up here, not `svg`).
    if is_link and tag.img and _to_remove(tag.img):
        return True

    css_classes = tag.get("class") or ()

    # Permalinks.
    if is_link and "headerlink" in css_classes:
        return True

    # Twemojis and tab labels.
    return any(marker in css_classes for marker in ("twemoji", "tabbed-labels"))
def autoclean(soup: Soup) -> None:
    """Auto-clean the soup by removing elements.

    The soup is modified in place:

    - elements matched by `_to_remove` are removed;
    - `autoref` elements and `div.doc-md-description` are replaced by their text;
    - `span.doc-labels` elements are removed;
    - `table.highlighttable` elements are replaced by a `pre` element
      holding the text of their first `code` descendant.

    Parameters:
        soup: The soup to modify.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # Remove unwanted elements.
    for element in soup.find_all(_to_remove):
        element.decompose()

    # Unwrap autoref elements.
    for element in soup.find_all("autoref"):
        element.replace_with(NavigableString(element.get_text()))

    # Unwrap mkdocstrings div.doc-md-description.
    for element in soup.find_all("div", attrs={"class": "doc-md-description"}):
        element.replace_with(NavigableString(element.get_text().strip()))

    # Remove mkdocstrings labels.
    for element in soup.find_all("span", attrs={"class": "doc-labels"}):
        element.decompose()

    # Remove line numbers from code blocks.
    for element in soup.find_all("table", attrs={"class": "highlighttable"}):
        code = element.find("code")
        if code is None:
            # Nothing to extract: leave the table as is instead of crashing.
            continue
        # `get_text()` returns decoded text. Escape it before re-parsing,
        # otherwise `<`, `>` and `&` in the code are read back as markup.
        code_text = escape(code.get_text(), quote=False)
        element.replace_with(Soup(f"<pre>{code_text}</pre>", "html.parser"))