Coverage for src/mkdocs_spellcheck/words.py: 94.87%

58 statements  

coverage.py v7.4.4, created at 2024-05-05 19:28 +0200

1"""This module contains a function to retrieve words from HTML text.""" 

2 

3from __future__ import annotations 

4 

5import re 

6import unicodedata 

7from functools import partial 

8from html.parser import HTMLParser 

9from io import StringIO 

10 

11 

12class _MLStripper(HTMLParser): 

13 def __init__(self, ignore_code: bool = True) -> None: # noqa: FBT001,FBT002 

14 super().__init__() 

15 self.reset() 

16 self.strict = False 

17 self.convert_charrefs = True 

18 self.text = StringIO() 

19 self.ignore_code = ignore_code 

20 self.in_code_tag = False 

21 

22 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: # noqa: ARG002 

23 if tag == "code": 

24 self.in_code_tag = True 

25 self.text.write(" ") 

26 

27 def handle_endtag(self, tag: str) -> None: 

28 if tag == "code": 

29 self.in_code_tag = False 

30 

31 def handle_data(self, data: str) -> None: 

32 if not (self.ignore_code and self.in_code_tag): 

33 self.text.write(data) 

34 

35 def get_data(self) -> str: 

36 return self.text.getvalue() 

37 

38 

39def _strip_tags(html: str, ignore_code: bool) -> str: # noqa: FBT001 

40 stripper = _MLStripper(ignore_code) 

41 stripper.feed(html) 

42 return stripper.get_data() 

43 

44 

45not_letters_nor_spaces = re.compile(r"(?:(\B\'|\'\B|\B\'\B|\'s)|[^\w\s\'-])") 

46dashes_or_spaces = re.compile(r"[-\s]+") 

47 

48 

49def _normalize(value: str, allow_unicode: bool = False) -> str: # noqa: FBT001,FBT002 

50 value = str(value) 

51 if allow_unicode: 

52 value = unicodedata.normalize("NFKC", value) 

53 else: 

54 value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii") 

55 value = not_letters_nor_spaces.sub(" ", value) 

56 return dashes_or_spaces.sub("-", value).strip("-_") 

57 

58 

59def _keep_word(word: str, min_length: int, max_capital: int) -> bool: 

60 if len(word) < min_length: 

61 return False 

62 capitals = 0 

63 for char in word: 

64 if char.isdigit(): 64 ↛ 65line 64 didn't jump to line 65, because the condition on line 64 was never true

65 return False 

66 if char.isupper(): 

67 capitals += 1 

68 if capitals > max_capital: 68 ↛ 69line 68 didn't jump to line 69, because the condition on line 68 was never true

69 return False 

70 return True 

71 

72 

73def get_words( 

74 html: str, 

75 *, 

76 known_words: set[str] | None = None, 

77 min_length: int = 2, 

78 max_capital: int = 1, 

79 ignore_code: bool = True, 

80 allow_unicode: bool = True, 

81) -> list[str]: 

82 """Get words in HTML text. 

83 

84 Parameters: 

85 html: The HTML text. 

86 known_words: Words to exclude. 

87 min_length: Words minimum length. 

88 max_capital: Maximum number of capital letters. 

89 ignore_code: Ignore words in code tags. 

90 allow_unicode: Keep unicode characters. 

91 

92 Returns: 

93 A list of words. 

94 """ 

95 known_words = known_words or set() 

96 keep = partial(_keep_word, min_length=min_length, max_capital=max_capital) 

97 filtered = filter(keep, _normalize(_strip_tags(html, ignore_code), allow_unicode).split("-")) 

98 words = {word.lower() for word in filtered} 

99 return sorted(words - known_words)
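
Usage sketch for the public get_words function (not part of the covered module; the sample HTML, the known_words value, and the expected output below are illustrative assumptions based on reading the code above):

from mkdocs_spellcheck.words import get_words

# Words inside <code> tags are skipped by default (ignore_code=True),
# so "HTMLParser" never reaches the spellcheck word list.
html = "<p>Hello, <em>wonderful</em> <code>HTMLParser</code> world!</p>"

# "hello" is excluded via known_words; results are lowercased and sorted.
print(get_words(html, known_words={"hello"}))
# Expected, per the logic above: ['wonderful', 'world']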