38 lines
1.3 KiB
Python
38 lines
1.3 KiB
Python
|
|
import html
|
||
|
|
import re
|
||
|
|
from typing import Dict, Union
|
||
|
|
|
||
|
|
|
||
|
|
class CreditHtmlIdentityService:
    """Extract a staff member's name and ID-card number from credit-sample HTML.

    The values are read from two ``<meta>`` tags embedded in the page:
    ``ccdi-staff-name`` and ``ccdi-staff-id-card``.
    """

    # Strict pattern (name attribute first, double quotes, no other attributes).
    # It is always tried first so inputs that matched before give the same result.
    _META_TEMPLATE = r'<meta\s+name="{name}"\s+content="(?P<value>[^"]+)"\s*/?>'

    # One complete <meta ...> tag. Quoted attribute values may contain ">".
    _META_TAG_RE = re.compile(
        r"""<meta\b(?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)>""",
        re.IGNORECASE,
    )

    # One key=value attribute: double-quoted, single-quoted or unquoted value.
    _ATTR_RE = re.compile(
        r"""(?P<key>[^\s"'=<>/]+)\s*=\s*(?:"(?P<dq>[^"]*)"|'(?P<sq>[^']*)'|(?P<bare>[^\s"'=<>`]+))"""
    )

    def extract_identity(self, html_content: Union[str, bytes]) -> Dict[str, str]:
        """Return the identity fields found in *html_content*.

        Args:
            html_content: The HTML document, as text or as raw bytes.

        Returns:
            A dict that contains ``"staff_name"`` and/or ``"staff_id_card"``
            only for the values that are present and non-blank. Empty or
            missing input yields an empty dict.
        """
        text = self._normalize_html(html_content)
        if not text:
            # Nothing to search (empty document or None).
            return {}

        candidates = {
            "staff_name": self._extract_meta_content(text, "ccdi-staff-name"),
            "staff_id_card": self._extract_meta_content(text, "ccdi-staff-id-card"),
        }
        return {key: value for key, value in candidates.items() if value}

    def _extract_meta_content(self, text: str, meta_name: str) -> str:
        """Return the unescaped, stripped ``content`` of the named meta tag.

        The strict ``_META_TEMPLATE`` pattern is tried first. Only when it finds
        nothing is a tolerant scan used, which also accepts single-quoted or
        unquoted values, any attribute order and additional attributes.

        Returns:
            The content value, or ``""`` when no matching tag is found.
        """
        strict = re.search(
            self._META_TEMPLATE.format(name=re.escape(meta_name)),
            text,
            flags=re.IGNORECASE,
        )
        if strict:
            raw_value = strict.group("value")
        else:
            raw_value = self._scan_meta_tags(text, meta_name)
        return html.unescape(raw_value).strip()

    @classmethod
    def _scan_meta_tags(cls, text: str, meta_name: str) -> str:
        """Find the raw ``content`` of the first meta tag named *meta_name*.

        The name comparison is case-insensitive, matching the strict pattern's
        ``re.IGNORECASE`` behaviour. Tags with an empty ``content`` are skipped.
        """
        wanted = meta_name.lower()
        for tag in cls._META_TAG_RE.finditer(text):
            attrs: Dict[str, str] = {}
            for attr in cls._ATTR_RE.finditer(tag.group("attrs")):
                value = next(
                    v for v in attr.group("dq", "sq", "bare") if v is not None
                )
                # The first occurrence of a duplicated attribute wins.
                attrs.setdefault(attr.group("key").lower(), value)
            if attrs.get("name", "").lower() == wanted and attrs.get("content"):
                return attrs["content"]
        return ""

    @staticmethod
    def _normalize_html(html_content: Union[str, bytes]) -> str:
        """Return *html_content* as text, decoding bytes when necessary.

        Bytes are decoded as strict UTF-8 first. If that fails and the document
        declares another charset (for example ``gbk``), that charset is used so
        non-UTF-8 pages are not silently stripped of their non-ASCII text. As a
        last resort the bytes are decoded as UTF-8 with undecodable bytes dropped.
        Non-bytes input is returned unchanged.
        """
        if not isinstance(html_content, (bytes, bytearray)):
            return html_content

        raw = bytes(html_content)
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            pass

        # Look for a charset declaration near the top of the document.
        declared = re.search(
            rb"""charset\s*=\s*["']?\s*([A-Za-z0-9._:-]+)""",
            raw[:4096],
            re.IGNORECASE,
        )
        if declared:
            charset = declared.group(1).decode("ascii").lower()
            if charset in ("gb2312", "gbk"):
                # gb18030 is a superset of both and decodes more characters.
                charset = "gb18030"
            # A UTF-* label is either wrong (strict UTF-8 already failed) or not
            # ASCII-compatible, so it is left to the lossy UTF-8 fallback below.
            if not charset.startswith("utf"):
                try:
                    return raw.decode(charset, errors="ignore")
                except (LookupError, UnicodeError):
                    # Unknown or unusable charset label.
                    pass

        return raw.decode("utf-8", errors="ignore")
|