Coverage for src/mkdocs_llmstxt/_internal/preprocess.py: 68.06%

1# HTML pre-processing.

3from __future__ import annotations

5import sys

6from importlib.util import module_from_spec, spec_from_file_location

7from typing import TYPE_CHECKING

9from bs4 import BeautifulSoup as Soup

10from bs4 import NavigableString

11from mkdocs.exceptions import PluginError

13if TYPE_CHECKING:

14 from types import ModuleType

16 from bs4 import Tag

def _load_module(module_path: str) -> ModuleType:
    """Load a Python module from a file path.

    The module is registered in `sys.modules` under the
    `mkdocs_llmstxt.user_config` namespace, using the file name
    without its extension as the last component of the module name.

    Parameters:
        module_path: The path of the Python file to load.

    Returns:
        The loaded (already executed) module.

    Raises:
        RuntimeError: When no import spec or loader could be created for the path.
    """
    from pathlib import Path  # noqa: PLC0415

    # Use the file stem ("hooks" for "path/to/hooks.py"), not the extension,
    # so that distinct files get distinct names in `sys.modules`.
    module_name = f"mkdocs_llmstxt.user_config.{Path(module_path).stem}"
    spec = spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError("Spec or loader is null")

    module = module_from_spec(spec)
    # The module is registered before execution, following the `importlib`
    # recipe for importing a source file directly.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a partially initialized module registered.
        sys.modules.pop(module_name, None)
        raise
    return module

def _preprocess(soup: Soup, module_path: str, output: str) -> None:
    """Run a user-defined pre-processing function on the HTML.

    Parameters:
        soup: The HTML (soup) to process before conversion to Markdown.
        module_path: The path of a Python module containing a `preprocess` function.
            The function is called with two positional arguments:
            the soup (an instance of [`bs4.BeautifulSoup`][])
            and the output path.
        output: The output path of the relevant Markdown file.

    Returns:
        Nothing. The user function is expected to modify the soup in place.

    Raises:
        PluginError: When the module cannot be loaded,
            or when looking up or calling its `preprocess` function fails.
    """
    try:
        user_module = _load_module(module_path)
    except Exception as load_error:
        message = f"Could not load module: {load_error}"
        raise PluginError(message) from load_error

    # The attribute lookup stays inside the `try` so that a module
    # without a `preprocess` function is reported as a plugin error too.
    try:
        user_module.preprocess(soup, output)
    except Exception as hook_error:
        message = f"Could not pre-process HTML: {hook_error}"
        raise PluginError(message) from hook_error

def _to_remove(tag: Tag) -> bool:
    """Tell whether a tag should be removed from the soup.

    Parameters:
        tag: The tag to check.

    Returns:
        True for images, SVGs, links wrapping an image, permalinks,
        Twemojis and tab labels; false for anything else.
    """
    name = tag.name

    # Images and SVGs are always removed.
    if name in {"img", "svg"}:
        return True

    css_classes = tag.get("class") or ()

    if name == "a":
        # Links whose first `img` descendant is itself removable.
        image = tag.img
        if image and _to_remove(image):
            return True
        # Permalinks.
        if "headerlink" in css_classes:
            return True

    # Twemojis and tab labels.
    return "twemoji" in css_classes or "tabbed-labels" in css_classes

def autoclean(soup: Soup) -> None:
    """Auto-clean the soup by removing elements.

    The soup is modified in place: unwanted elements are removed,
    `autoref` elements and mkdocstrings descriptions are replaced by
    their text, and line-numbered code tables are replaced by a plain
    `pre` element holding the code text.

    Parameters:
        soup: The soup to modify.
    """
    # Remove unwanted elements.
    for element in soup.find_all(_to_remove):
        element.decompose()

    # Unwrap autoref elements.
    for element in soup.find_all("autoref"):
        element.replace_with(NavigableString(element.get_text()))

    # Unwrap mkdocstrings div.doc-md-description.
    for element in soup.find_all("div", attrs={"class": "doc-md-description"}):
        element.replace_with(NavigableString(element.get_text().strip()))

    # Remove mkdocstrings labels.
    for element in soup.find_all("span", attrs={"class": "doc-labels"}):
        element.decompose()

    # Remove line numbers from code blocks.
    for element in soup.find_all("table", attrs={"class": "highlighttable"}):
        code = element.find("code")
        if code is None:
            # No code element to keep: leave the table untouched.
            continue
        # Build the tag and assign its text instead of re-parsing an HTML
        # string, so that `<`, `>` and `&` in the code stay literal text.
        pre = soup.new_tag("pre")
        pre.string = code.get_text()
        element.replace_with(pre)