import html
import re
from typing import Dict, Union
class CreditHtmlIdentityService:
    """Extract the staff name and ID-card number from credit-sample HTML.

    The values are read from ``<meta name="..." content="...">`` tags whose
    names are ``ccdi-staff-name`` and ``ccdi-staff-id-card``.
    """

    # Regex template for one meta tag. ``{name}`` is filled in through
    # ``str.format`` with the regex-escaped meta name, so the template must
    # not contain any other literal braces. The named group ``value``
    # captures the (non-empty) content attribute.
    _META_TEMPLATE = (
        r'<meta\s+name="{name}"\s+content="(?P<value>[^"]+)"\s*/?>'
    )

    def extract_identity(self, html_content: Union[str, bytes]) -> Dict[str, str]:
        """Return the identity fields found in *html_content*.

        Args:
            html_content: The HTML document, as text or as raw bytes.

        Returns:
            A dict that may contain the keys ``"staff_name"`` and
            ``"staff_id_card"``. A key is present only when the matching
            meta tag exists and its content is non-empty after stripping.
        """
        text = self._normalize_html(html_content)
        staff_name = self._extract_meta_content(text, "ccdi-staff-name")
        staff_id_card = self._extract_meta_content(text, "ccdi-staff-id-card")

        identity: Dict[str, str] = {}
        if staff_name:
            identity["staff_name"] = staff_name
        if staff_id_card:
            identity["staff_id_card"] = staff_id_card
        return identity

    def _extract_meta_content(self, text: str, meta_name: str) -> str:
        """Return the content of the first meta tag named *meta_name*.

        Matching is case-insensitive. HTML entities in the content are
        unescaped and surrounding whitespace is stripped. An empty string
        is returned when no such tag is found.
        """
        matched = re.search(
            # Escape the name so characters such as "-" are matched literally.
            self._META_TEMPLATE.format(name=re.escape(meta_name)),
            text,
            flags=re.IGNORECASE,
        )
        if not matched:
            return ""
        return html.unescape(matched.group("value")).strip()

    @staticmethod
    def _normalize_html(html_content: Union[str, bytes]) -> str:
        """Return *html_content* as ``str``.

        Bytes are decoded as UTF-8 with undecodable bytes dropped; any other
        input is returned unchanged.
        """
        if isinstance(html_content, bytes):
            return html_content.decode("utf-8", errors="ignore")
        return html_content