Coverage for src/mkdocs_spellcheck/words.py: 94.87%
58 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-05 19:28 +0200
1"""This module contains a function to retrieve words from HTML text."""
3from __future__ import annotations
5import re
6import unicodedata
7from functools import partial
8from html.parser import HTMLParser
9from io import StringIO
12class _MLStripper(HTMLParser):
13 def __init__(self, ignore_code: bool = True) -> None: # noqa: FBT001,FBT002
14 super().__init__()
15 self.reset()
16 self.strict = False
17 self.convert_charrefs = True
18 self.text = StringIO()
19 self.ignore_code = ignore_code
20 self.in_code_tag = False
22 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: # noqa: ARG002
23 if tag == "code":
24 self.in_code_tag = True
25 self.text.write(" ")
27 def handle_endtag(self, tag: str) -> None:
28 if tag == "code":
29 self.in_code_tag = False
31 def handle_data(self, data: str) -> None:
32 if not (self.ignore_code and self.in_code_tag):
33 self.text.write(data)
35 def get_data(self) -> str:
36 return self.text.getvalue()
def _strip_tags(html: str, ignore_code: bool) -> str:  # noqa: FBT001
    """Return the text content of an HTML string, without its markup.

    Parameters:
        html: The HTML text.
        ignore_code: Whether to drop the text found inside `<code>` tags.

    Returns:
        The text collected by the parser.
    """
    stripper = _MLStripper(ignore_code)
    stripper.feed(html)
    # `feed()` may keep trailing data buffered while waiting for more input
    # (for example text ending with an unterminated `&...` sequence such as
    # "AT&T"). `close()` forces the parser to process whatever is left, so
    # that this text is not silently dropped.
    stripper.close()
    return stripper.get_data()
# Matches what must be blanked out before splitting text into words:
# - an apostrophe that is not strictly inside a word (leading or trailing
#   quote), or a possessive `'s`;
# - any character that is not a word character, whitespace, apostrophe or dash.
not_letters_nor_spaces = re.compile(r"(?:(\B\'|\'\B|\B\'\B|\'s)|[^\w\s\'-])")
# Matches any run of dashes and/or whitespace, i.e. a separator between words.
dashes_or_spaces = re.compile(r"[-\s]+")
def _normalize(value: str, allow_unicode: bool = False) -> str:  # noqa: FBT001,FBT002
    """Turn text into a dash-separated sequence of words.

    Parameters:
        value: The text to normalize (converted with `str()` first).
        allow_unicode: When true, apply NFKC normalization and keep non-ASCII
            characters. Otherwise apply NFKD normalization and drop every
            character that cannot be encoded as ASCII.

    Returns:
        The text with unwanted characters blanked out, every run of dashes
        and whitespace replaced by a single dash, and leading/trailing
        dashes and underscores removed.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize("NFKC", value)
    else:
        # Decompose characters, then discard whatever is not plain ASCII.
        value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = not_letters_nor_spaces.sub(" ", value)
    return dashes_or_spaces.sub("-", value).strip("-_")
59def _keep_word(word: str, min_length: int, max_capital: int) -> bool:
60 if len(word) < min_length:
61 return False
62 capitals = 0
63 for char in word:
64 if char.isdigit(): 64 ↛ 65line 64 didn't jump to line 65, because the condition on line 64 was never true
65 return False
66 if char.isupper():
67 capitals += 1
68 if capitals > max_capital: 68 ↛ 69line 68 didn't jump to line 69, because the condition on line 68 was never true
69 return False
70 return True
def get_words(
    html: str,
    *,
    known_words: set[str] | None = None,
    min_length: int = 2,
    max_capital: int = 1,
    ignore_code: bool = True,
    allow_unicode: bool = True,
) -> list[str]:
    """Get words in HTML text.

    Parameters:
        html: The HTML text.
        known_words: Words to exclude. They are compared as-is against
            the lowercased words found in the text.
        min_length: Words minimum length.
        max_capital: Maximum number of capital letters.
        ignore_code: Ignore words in code tags.
        allow_unicode: Keep unicode characters.

    Returns:
        A sorted list of the unique, lowercased words that were kept.
    """
    known_words = known_words or set()
    keep = partial(_keep_word, min_length=min_length, max_capital=max_capital)
    # Strip the markup, normalize the text into dash-separated words,
    # then filter the candidates (on their original casing).
    filtered = filter(keep, _normalize(_strip_tags(html, ignore_code), allow_unicode).split("-"))
    words = {word.lower() for word in filtered}
    return sorted(words - known_words)