import html
import re
from typing import Dict, Union


class CreditHtmlIdentityService:
    """Extract the staff name and ID-card number from credit-report sample HTML.

    The identity values are read from ``<meta name="..." content="...">`` tags
    named ``ccdi-staff-name`` and ``ccdi-staff-id-card``.
    """

    # Regex template for one meta tag. ``{name}`` is filled in (already
    # regex-escaped) by ``_extract_meta_content``; the named group ``value``
    # captures the double-quoted ``content`` attribute. The template must not
    # contain any other braces because it is expanded with ``str.format``.
    _META_TEMPLATE = (
        r'<meta\s+name="{name}"\s+content="(?P<value>[^"]+)"\s*/?>'
    )

    def extract_identity(self, html_content: Union[str, bytes]) -> Dict[str, str]:
        """Return the identity fields found in *html_content*.

        Args:
            html_content: The HTML document, as text or as raw bytes.

        Returns:
            A dict that contains ``"staff_name"`` and/or ``"staff_id_card"``
            only for the fields that are present and non-empty; an empty dict
            when neither is found.
        """
        text = self._normalize_html(html_content)
        staff_name = self._extract_meta_content(text, "ccdi-staff-name")
        staff_id_card = self._extract_meta_content(text, "ccdi-staff-id-card")

        identity: Dict[str, str] = {}
        if staff_name:
            identity["staff_name"] = staff_name
        if staff_id_card:
            identity["staff_id_card"] = staff_id_card
        return identity

    def _extract_meta_content(self, text: str, meta_name: str) -> str:
        """Return the ``content`` of the first meta tag named *meta_name*.

        Matching is case-insensitive. HTML entities in the value are decoded
        and surrounding whitespace is stripped. Returns ``""`` when no such
        tag is found.
        """
        # re.escape keeps characters such as "-" in the name literal.
        pattern = self._META_TEMPLATE.format(name=re.escape(meta_name))
        matched = re.search(pattern, text, flags=re.IGNORECASE)
        if not matched:
            return ""
        return html.unescape(matched.group("value")).strip()

    @staticmethod
    def _normalize_html(html_content: Union[str, bytes]) -> str:
        """Return *html_content* as ``str``.

        Bytes are decoded as UTF-8 and undecodable bytes are dropped, so a
        partially corrupt document can still be searched.
        """
        if isinstance(html_content, bytes):
            return html_content.decode("utf-8", errors="ignore")
        return html_content