from __future__ import annotations

from html.parser import HTMLParser

from aiogram.types import MessageEntity

# Tags that translate directly into an entity type with no extra fields.
# "a" and "tg-emoji" are handled separately because they carry attributes.
_SIMPLE_ENTITY_TYPES: dict[str, str] = {
    "b": "bold",
    "strong": "bold",
    "i": "italic",
    "em": "italic",
    "u": "underline",
    "s": "strikethrough",
    "strike": "strikethrough",
    "del": "strikethrough",
    "code": "code",
    "pre": "pre",
    "tg-spoiler": "spoiler",
    "blockquote": "blockquote",
}


def utf16_len(value: str) -> int:
    """Return the length of *value* measured in UTF-16 code units.

    Characters outside the Basic Multilingual Plane (most emoji, for
    example) count as two units.  Entity offsets and lengths in this module
    are expressed in these units.
    """
    return len(value.encode("utf-16-le")) // 2


class TelegramHTMLParser(HTMLParser):
    """Split HTML markup into plain text plus ``MessageEntity`` objects.

    Feed markup with :meth:`feed`; afterwards :attr:`text` holds the text
    with all tags removed and :attr:`entities` holds one entity per
    recognised, properly closed tag.  Unknown tags are dropped but their
    text content is kept.  Tags that are never closed produce no entity.
    """

    def __init__(self) -> None:
        # convert_charrefs=True makes the base parser decode references such
        # as "&amp;" before handle_data is called.
        super().__init__(convert_charrefs=True)
        self.text_parts: list[str] = []
        self.entities: list[MessageEntity] = []
        # Currently open tags, innermost last.  Each entry is a dict with the
        # keys "tag", "offset" and "attrs".
        self.stack: list[dict] = []
        # Running UTF-16 length of everything appended through handle_data,
        # so the offset lookup on every tag does not re-encode the whole text.
        self._utf16_offset = 0

    @property
    def text(self) -> str:
        """Plain text collected so far, with all markup removed."""
        return "".join(self.text_parts)

    @property
    def entity_offset(self) -> int:
        """Current position in the collected text, in UTF-16 code units."""
        return self._utf16_offset

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Remember where *tag* opened so its entity can be emitted on close."""
        # Attributes written without a value arrive as None and are dropped.
        attr_map = {key: value for key, value in attrs if value is not None}
        self.stack.append(
            {"tag": tag, "offset": self.entity_offset, "attrs": attr_map}
        )

    def handle_endtag(self, tag: str) -> None:
        """Close the innermost open *tag* and emit the entity it covers.

        An end tag with no matching open tag is ignored.
        """
        # Search from the top of the stack so the most recently opened
        # matching tag is the one that gets closed.
        for index in range(len(self.stack) - 1, -1, -1):
            entry = self.stack[index]
            if entry["tag"] == tag:
                del self.stack[index]
                start = entry["offset"]
                self._emit_entity(
                    tag, start, self.entity_offset - start, entry["attrs"]
                )
                return

    def handle_data(self, data: str) -> None:
        """Append a run of text and advance the UTF-16 offset."""
        self.text_parts.append(data)
        self._utf16_offset += utf16_len(data)

    def _emit_entity(
        self, tag: str, offset: int, length: int, attrs: dict[str, str]
    ) -> None:
        """Append the entity for *tag* to :attr:`entities`, if there is one.

        Nothing is emitted for empty spans, for unknown tags, or for ``<a>``
        tags without a usable ``href``.
        """
        if length <= 0:
            return

        kwargs: dict[str, str] = {}
        if tag == "a":
            href = attrs.get("href")
            if not href:
                # A link without a target cannot become a text_link entity;
                # skip it instead of raising KeyError on malformed markup.
                return
            entity_type = "text_link"
            kwargs["url"] = href
        elif tag == "tg-emoji":
            entity_type = "custom_emoji"
            # Both spellings of the attribute name are accepted.
            kwargs["custom_emoji_id"] = (
                attrs.get("emoji-id") or attrs.get("emoji_id", "")
            )
        else:
            simple_type = _SIMPLE_ENTITY_TYPES.get(tag)
            if simple_type is None:
                return
            entity_type = simple_type

        self.entities.append(
            MessageEntity(
                type=entity_type,
                offset=offset,
                length=length,
                **kwargs,
            )
        )


def html_to_text_entities(html: str) -> tuple[str, list[MessageEntity]]:
    """Convert HTML markup into plain text and a sorted list of entities.

    Returns:
        A ``(text, entities)`` tuple.  Entities are ordered by offset, and
        for equal offsets the longer (outer) entity comes first.
    """
    parser = TelegramHTMLParser()
    parser.feed(html)
    parser.close()
    entities = sorted(parser.entities, key=lambda item: (item.offset, -item.length))
    return parser.text, entities