lsfx-mock-server/services/credit_html_identity_service.py

import html
import re
from typing import Dict, Union


class CreditHtmlIdentityService:
    """Extract the staff name and ID-card number from a credit sample HTML page.

    The values are read from ``<meta>`` tags whose ``name`` attribute is
    ``ccdi-staff-name`` / ``ccdi-staff-id-card``; the tag's ``content``
    attribute carries the value.
    """

    # Legacy strict pattern (``name`` before ``content``, double quotes only).
    # Retained so existing references keep resolving; extraction itself now
    # uses the tolerant patterns below.
    _META_TEMPLATE = r'<meta\s+name="{name}"\s+content="(?P<value>[^"]+)"\s*/?>'

    # One complete ``<meta ...>`` tag. Quoted runs are consumed as a unit so
    # that a ">" inside an attribute value does not terminate the tag early.
    _META_TAG_PATTERN = re.compile(
        r"""<meta\s+(?P<attrs>(?:"[^"]*"|'[^']*'|[^>"'])*)>""",
        re.IGNORECASE,
    )

    # One ``key=value`` attribute with a double-quoted, single-quoted or
    # unquoted value.
    _META_ATTR_PATTERN = re.compile(
        r"""(?P<key>[^\s"'=<>/]+)\s*=\s*"""
        r"""(?:"(?P<dq>[^"]*)"|'(?P<sq>[^']*)'|(?P<bare>[^\s"'>]+))"""
    )

    def extract_identity(self, html_content: Union[str, bytes]) -> Dict[str, str]:
        """Return the identity fields found in *html_content*.

        Args:
            html_content: The HTML document, as text or UTF-8 encoded bytes.

        Returns:
            A dict holding ``staff_name`` and/or ``staff_id_card``. A key is
            present only when the matching meta tag exists and its value is
            non-empty after unescaping and stripping.
        """
        text = self._normalize_html(html_content)
        staff_name = self._extract_meta_content(text, "ccdi-staff-name")
        staff_id_card = self._extract_meta_content(text, "ccdi-staff-id-card")

        identity: Dict[str, str] = {}
        if staff_name:
            identity["staff_name"] = staff_name
        if staff_id_card:
            identity["staff_id_card"] = staff_id_card
        return identity

    def _extract_meta_content(self, text: str, meta_name: str) -> str:
        """Return the unescaped, stripped ``content`` of the meta tag *meta_name*.

        The tag name and attribute keys are matched case-insensitively, and the
        ``name`` / ``content`` attributes may appear in any order, with either
        quote style, and alongside other attributes. The first matching tag
        whose raw ``content`` is non-empty wins; ``""`` is returned when there
        is none.
        """
        wanted = meta_name.lower()
        for tag in self._META_TAG_PATTERN.finditer(text):
            attributes = self._parse_meta_attributes(tag.group("attrs"))
            if attributes.get("name", "").lower() != wanted:
                continue
            raw_value = attributes.get("content", "")
            # A tag with empty content is skipped rather than treated as the
            # answer, so a later tag of the same name can still supply it.
            if raw_value:
                return html.unescape(raw_value).strip()
        return ""

    @classmethod
    def _parse_meta_attributes(cls, attr_text: str) -> Dict[str, str]:
        """Parse the attribute section of one tag into ``{lowercase key: raw value}``.

        When a key is repeated, the first occurrence is kept.
        """
        attributes: Dict[str, str] = {}
        for attr in cls._META_ATTR_PATTERN.finditer(attr_text):
            # Exactly one of the three value groups participates in a match.
            value = next(
                candidate
                for candidate in attr.group("dq", "sq", "bare")
                if candidate is not None
            )
            attributes.setdefault(attr.group("key").lower(), value)
        return attributes

    @staticmethod
    def _normalize_html(html_content: Union[str, bytes]) -> str:
        """Return *html_content* as ``str``.

        Bytes-like input is decoded as UTF-8 and undecodable bytes are dropped;
        ``str`` input is returned unchanged.
        """
        if isinstance(html_content, (bytes, bytearray)):
            return html_content.decode("utf-8", errors="ignore")
        return html_content
新增征信员工HTML样本并改造Mock解析 2026-03-23 20:35:52 +08:00			`import html`
			`import re`
			`from typing import Dict, Union`


			`class CreditHtmlIdentityService:`
			`"""从征信样本 HTML 中提取员工姓名和身份证号。"""`

			`_META_TEMPLATE = r'<meta\s+name="{name}"\s+content="(?P<value>[^"]+)"\s*/?>'`

			`def extract_identity(self, html_content: Union[str, bytes]) -> Dict[str, str]:`
			`text = self._normalize_html(html_content)`
			`staff_name = self._extract_meta_content(text, "ccdi-staff-name")`
			`staff_id_card = self._extract_meta_content(text, "ccdi-staff-id-card")`

			`identity: Dict[str, str] = {}`
			`if staff_name:`
			`identity["staff_name"] = staff_name`
			`if staff_id_card:`
			`identity["staff_id_card"] = staff_id_card`
			`return identity`

			`def _extract_meta_content(self, text: str, meta_name: str) -> str:`
			`matched = re.search(`
			`self._META_TEMPLATE.format(name=re.escape(meta_name)),`
			`text,`
			`flags=re.IGNORECASE,`
			`)`
			`if not matched:`
			`return ""`
			`return html.unescape(matched.group("value")).strip()`

			`@staticmethod`
			`def _normalize_html(html_content: Union[str, bytes]) -> str:`
			`if isinstance(html_content, bytes):`
			`return html_content.decode("utf-8", errors="ignore")`
			`return html_content`