import html
import re
from typing import Dict, Union


class CreditHtmlIdentityService:
    """Extract an employee's name and ID-card number from credit-sample HTML.

    The values are read from ``<meta>`` tags named ``ccdi-staff-name`` and
    ``ccdi-staff-id-card``. Tag names, attribute names and the meta ``name``
    value are matched case-insensitively, attributes may appear in any order,
    and attribute values may be double-quoted, single-quoted or unquoted.
    """

    # One <meta ...> tag. Quoted runs are consumed as a unit so that a ">"
    # inside an attribute value does not terminate the tag prematurely.
    _META_TAG_RE = re.compile(
        r"""<meta\s(?P<attrs>(?:"[^"]*"|'[^']*'|[^>"'])*)>""",
        re.IGNORECASE,
    )

    # One key=value attribute inside a tag (double-, single- or un-quoted).
    _ATTR_RE = re.compile(
        r"""(?P<key>[^\s"'=<>/]+)\s*=\s*"""
        r"""(?:"(?P<dq>[^"]*)"|'(?P<sq>[^']*)'|(?P<bare>[^\s"'>]+))"""
    )

    def extract_identity(self, html_content: Union[str, bytes]) -> Dict[str, str]:
        """Return the identity fields found in *html_content*.

        Args:
            html_content: The HTML document as text or as UTF-8 encoded bytes.

        Returns:
            A dict that contains ``"staff_name"`` and/or ``"staff_id_card"``
            only for the fields that were found with a non-empty value; an
            empty dict when neither is present.
        """
        text = self._normalize_html(html_content)
        staff_name = self._extract_meta_content(text, "ccdi-staff-name")
        staff_id_card = self._extract_meta_content(text, "ccdi-staff-id-card")

        identity: Dict[str, str] = {}
        if staff_name:
            identity["staff_name"] = staff_name
        if staff_id_card:
            identity["staff_id_card"] = staff_id_card
        return identity

    def _extract_meta_content(self, text: str, meta_name: str) -> str:
        """Return the ``content`` of the first ``<meta>`` named *meta_name*.

        Tags whose ``content`` attribute is missing or empty are skipped so a
        later tag with the same name can still supply the value. The returned
        value has HTML entities decoded and surrounding whitespace removed;
        ``""`` is returned when no usable tag exists.
        """
        wanted = meta_name.lower()
        for tag in self._META_TAG_RE.finditer(text):
            attrs: Dict[str, str] = {}
            for attr in self._ATTR_RE.finditer(tag.group("attrs")):
                value = next(
                    group
                    for group in attr.group("dq", "sq", "bare")
                    if group is not None
                )
                # On duplicate attributes the first occurrence wins.
                attrs.setdefault(attr.group("key").lower(), value)

            if attrs.get("name", "").lower() != wanted:
                continue
            content = attrs.get("content", "")
            if content:
                return html.unescape(content).strip()
        return ""

    @staticmethod
    def _normalize_html(html_content: Union[str, bytes]) -> str:
        """Return *html_content* as ``str``.

        Bytes-like input is decoded as UTF-8; undecodable bytes are dropped
        rather than raising, so extraction stays best-effort.
        """
        if isinstance(html_content, (bytes, bytearray)):
            return html_content.decode("utf-8", errors="ignore")
        return html_content