Coverage for src/mkdocs_spellcheck/words.py: 94.87%

1"""This module contains a function to retrieve words from HTML text."""

3from __future__ import annotations

5import re

6import unicodedata

7from functools import partial

8from html.parser import HTMLParser

9from io import StringIO

12class _MLStripper(HTMLParser):

13 def __init__(self, ignore_code: bool = True) -> None: # noqa: FBT001,FBT002

14 super().__init__()

15 self.reset()

16 self.strict = False

17 self.convert_charrefs = True

18 self.text = StringIO()

19 self.ignore_code = ignore_code

20 self.in_code_tag = False

22 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: # noqa: ARG002

23 if tag == "code":

24 self.in_code_tag = True

25 self.text.write(" ")

27 def handle_endtag(self, tag: str) -> None:

28 if tag == "code":

29 self.in_code_tag = False

31 def handle_data(self, data: str) -> None:

32 if not (self.ignore_code and self.in_code_tag):

33 self.text.write(data)

35 def get_data(self) -> str:

36 return self.text.getvalue()

39def _strip_tags(html: str, ignore_code: bool) -> str: # noqa: FBT001

40 stripper = _MLStripper(ignore_code)

41 stripper.feed(html)

42 return stripper.get_data()

# Matches everything that must be turned into a separator before splitting:
# an apostrophe with a non-word-boundary on at least one side (such as a
# leading or trailing quote), a `'s` sequence, or any character that is not
# a word character, whitespace, an apostrophe or a dash.
not_letters_nor_spaces = re.compile(
    r"(?:(\B\'|\'\B|\B\'\B|\'s)|[^\w\s\'-])",
)

# Matches any run of dashes and/or whitespace characters.
dashes_or_spaces = re.compile(
    r"[-\s]+",
)

def _normalize(value: str, allow_unicode: bool = False) -> str:  # noqa: FBT001,FBT002
    """Turn text into a dash-separated sequence of words.

    Parameters:
        value: The text to normalize (converted with `str()` first).
        allow_unicode: When true, apply NFKC normalization and keep non-ASCII
            characters. Otherwise apply NFKD normalization and drop every
            character that cannot be encoded as ASCII.

    Returns:
        The text where matches of `not_letters_nor_spaces` were replaced by
        spaces, runs of dashes/whitespace were collapsed into single dashes,
        and leading/trailing dashes and underscores were stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize("NFKD", text)
        # Characters without an ASCII representation are silently dropped.
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    else:
        text = unicodedata.normalize("NFKC", text)
    spaced = not_letters_nor_spaces.sub(" ", text)
    dashed = dashes_or_spaces.sub("-", spaced)
    return dashed.strip("-_")

59def _keep_word(word: str, min_length: int, max_capital: int) -> bool:

60 if len(word) < min_length:

61 return False

62 capitals = 0

63 for char in word:

64 if char.isdigit(): 64 ↛ 65line 64 didn't jump to line 65, because the condition on line 64 was never true

65 return False

66 if char.isupper():

67 capitals += 1

68 if capitals > max_capital: 68 ↛ 69line 68 didn't jump to line 69, because the condition on line 68 was never true

69 return False

70 return True

def get_words(
    html: str,
    *,
    known_words: set[str] | None = None,
    min_length: int = 2,
    max_capital: int = 1,
    ignore_code: bool = True,
    allow_unicode: bool = True,
) -> list[str]:
    """Get words in HTML text.

    Parameters:
        html: The HTML text.
        known_words: Words to exclude (matched case-insensitively).
        min_length: Words minimum length.
        max_capital: Maximum number of capital letters.
        ignore_code: Ignore words in code tags.
        allow_unicode: Keep unicode characters.

    Returns:
        A sorted list of unique, lowercased words.
    """
    # Candidate words are lowercased before the exclusion, so known words must
    # be lowercased as well: otherwise an entry such as "Python" could never
    # match anything.
    known = {word.lower() for word in known_words} if known_words else set()
    text = _normalize(_strip_tags(html, ignore_code), allow_unicode)
    words = {
        word.lower()
        for word in text.split("-")
        # Splitting can yield empty strings (e.g. for empty input); they are
        # never words, even when `min_length` is zero or negative.
        if word and _keep_word(word, min_length=min_length, max_capital=max_capital)
    }
    return sorted(words - known)