# Source: ccdi/doc/test-data/intermediary/generate-test-data-1000-valid.py
# Last modified: 2026-02-09 00:13:32 +08:00 (216 lines, 7.8 KiB, Python)
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional the warning can safely be ignored.
import pandas as pd
import random
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment
def calculate_id_check_code(id_17):
    """Compute the check character of a resident ID number (GB 11643-1999).

    Each of the first 17 digits is multiplied by a fixed positional weight;
    the weighted sum modulo 11 selects the check character from a lookup
    table.

    :param id_17: the first 17 characters of the ID number, all ASCII digits
    :return: the check character, one of '0'-'9' or 'X'
    :raises ValueError: if ``id_17`` is not a string of exactly 17 ASCII digits
    """
    # Positional weight factors for digits 1..17.
    weights = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    # Check character for each possible remainder 0..10.
    check_codes = '10X98765432'

    # Fail loudly on malformed input instead of raising an opaque IndexError
    # (too short) or silently ignoring trailing characters (too long).
    if (
        not isinstance(id_17, str)
        or len(id_17) != len(weights)
        or any(ch not in '0123456789' for ch in id_17)
    ):
        raise ValueError(
            f"id_17 must be a string of exactly 17 digits, got {id_17!r}"
        )

    weighted_sum = sum(
        int(digit) * weight for digit, weight in zip(id_17, weights)
    )
    return check_codes[weighted_sum % 11]
def generate_valid_person_id(id_type):
    """Produce a random document number for the given document type.

    For '身份证' the result is an 18-character number: 17 random digits
    followed by the check character from ``calculate_id_check_code``.
    Every other document type gets a random 8-digit number.

    :param id_type: document type label
    :return: the document number as a string
    """
    if id_type != '身份证':
        # Passport and the other travel permits: plain 8-digit number.
        return str(random.randint(10000000, 99999999))

    # Layout: 6-digit region + 4-digit year + 2-digit month + 2-digit day
    # + 3-digit sequence number. The day is drawn from 1-28, a range that
    # exists in every month.
    region = str(random.randint(110000, 659999))
    year = random.randint(1960, 2000)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    serial = random.randint(0, 999)

    first_17 = f"{region}{year}{month:02d}{day:02d}{serial:03d}"
    return first_17 + calculate_id_check_code(first_17)
# Validation of the resident ID check character
def validate_id_check_code(person_id):
    """Report whether an 18-character ID number ends in the right check character.

    The comparison is case-insensitive for the final character, so a
    lowercase 'x' is accepted.

    :param person_id: candidate ID number
    :return: True when the 18th character matches the check character
        computed from the first 17; False for anything malformed (not a
        string, wrong length, or a non-digit among the first 17 characters)
    """
    if not isinstance(person_id, str) or len(person_id) != 18:
        return False

    id_17, check_code = person_id[:17], person_id[17]

    # The checksum routine converts every character with int(); screen out
    # non-digits here so a validator never raises on merely invalid input.
    if any(ch not in '0123456789' for ch in id_17):
        return False

    return calculate_id_check_code(id_17) == check_code.upper()
# Vocabularies used by the data generators.
#
# NOTE(review): in the source as received, the surname pool, both given-name
# pools and two of the street suffixes were empty strings, so every generated
# name came out as ''. The values below are common stand-ins of the same
# length -- confirm them against the original file.
last_names = [
    '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '周', '吴',
    '徐', '孙', '马', '朱', '胡', '郭', '何', '高', '林', '罗',
]
first_names_male = [
    '伟', '强', '磊', '军', '勇', '杰', '涛', '明', '超', '刚',
    '平', '辉', '鹏', '华', '飞', '鑫', '波', '斌', '宇', '浩',
]
first_names_female = [
    '芳', '娜', '敏', '静', '丽', '艳', '娟', '霞', '燕', '玲',
    '婷', '雪', '慧', '颖', '倩', '琳', '洁', '梅', '莉', '兰',
]
person_types = ['中介']
person_sub_types = ['本人', '配偶', '子女', '父母', '其他']
genders = ['M', 'F', 'O']
id_types = ['身份证', '护照', '台胞证', '港澳通行证']
companies = [
    '房屋租赁公司', '房产经纪公司', '投资咨询公司', '置业咨询公司', '不动产咨询公司',
    '物业管理公司', '资产评估公司', '土地评估公司', '地产代理公司', '房产咨询公司',
]
# A None entry means the column is left blank for that row.
positions = [
    '区域经理', '店长', '高级经纪人', '房产经纪人', '销售经理', '置业顾问',
    '物业顾问', '评估师', '业务员', '总监', '主管', None,
]
# None is listed twice, so a blank is drawn twice as often as any one value.
relation_types = ['配偶', '子女', '父母', '兄弟姐妹', None, None]
provinces = [
    '北京市', '上海市', '广东省', '江苏省', '浙江省',
    '四川省', '河南省', '福建省', '湖北省', '湖南省',
]
districts = [
    '海淀区', '朝阳区', '天河区', '浦东新区', '西湖区',
    '黄浦区', '静安区', '徐汇区', '福田区', '罗湖区',
]
# NOTE(review): the first and fifth entries are stand-ins for lost values.
streets = ['路', '大街', '大道', '街道', '巷', '广场', '大厦', '花园']
buildings = [
    '1号楼', '2号楼', '3号楼', '4号楼', '5号楼',
    '6号楼', '7号楼', '8号楼', 'A座', 'B座',
]
def generate_name(gender):
    """Build a random full name: a surname followed by a given name.

    The male given-name pool is used for 'M'; every other gender value
    draws from the female pool.

    :param gender: gender code of the person
    :return: surname and given name concatenated
    """
    if gender == 'M':
        given_pool = first_names_male
    else:
        given_pool = first_names_female
    surname = random.choice(last_names)
    return surname + random.choice(given_pool)
def generate_mobile():
    """Return a random 11-digit mobile-style number as a string.

    Layout: a leading '1', a second digit drawn from 3/5/7/8/9, one
    arbitrary digit, then an 8-digit tail in the range 10000000-99999999.
    """
    second_digit = random.choice([3, 5, 7, 8, 9])
    third_digit = random.randint(0, 9)
    tail = random.randint(10000000, 99999999)
    return "1" + str(second_digit) + str(third_digit) + str(tail)
def generate_wechat():
    """Return a random handle: 'wx_' followed by 8 lowercase letters or digits."""
    alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789'
    suffix = ''.join(random.choices(alphabet, k=8))
    return "wx_" + suffix
def generate_address():
    """Compose a random address string.

    The parts are concatenated without separators: a province, a district,
    a street suffix and a number between 1 and 100.
    """
    parts = [
        random.choice(provinces),
        random.choice(districts),
        random.choice(streets),
        str(random.randint(1, 100)),
    ]
    return ''.join(parts)
def generate_social_credit_code():
    """Generate a random 18-character unified social credit code.

    The code follows the GB 32100-2015 layout: a 17-character body plus one
    check character. The body here is the fixed prefix '91' followed by 15
    random digits; the digits are random and are not guaranteed to form a
    real region or organization code. The check character is computed with
    the standard's weighted MOD 31 scheme, so the result passes checksum
    validation.

    :return: the 18-character code as a string
    """
    # Code alphabet of the standard: digits plus letters without I, O, S, V, Z.
    alphabet = "0123456789ABCDEFGHJKLMNPQRTUWXY"
    # Positional weights for characters 1..17 (powers of 3 modulo 31).
    weights = (1, 3, 9, 27, 19, 26, 16, 17, 20, 29, 25, 13, 8, 24, 10, 30, 28)

    body = "91" + "".join(random.choices("0123456789", k=15))
    weighted_sum = sum(
        alphabet.index(char) * weight for char, weight in zip(body, weights)
    )
    # 31 - remainder, with a result of 31 wrapping around to 0.
    check_char = alphabet[(31 - weighted_sum % 31) % 31]
    return body + check_char
def generate_related_num_id():
    """Return a random related-person identifier: 'ID' plus a 5-digit number."""
    number = random.randint(10000, 99999)
    return "ID" + str(number)
def generate_row(index):
    """Assemble one random test record keyed by the spreadsheet column headers.

    Optional columns are filled by drawing from a small list that mixes one
    generated value with one or more ``None`` entries, so they are sometimes
    left blank.

    :param index: position of the row being generated; not used by the body
    :return: dict mapping each column header to its value, in column order
    """
    gender = random.choice(genders)
    sub_type = random.choice(person_sub_types)
    document_type = random.choice(id_types)

    # Keys are inserted in column order; the values are produced in the same
    # order so the sequence of random draws is fixed.
    record = {}
    record['姓名*'] = generate_name(gender)
    record['人员类型'] = '中介'
    record['人员子类型'] = sub_type
    record['性别'] = gender
    record['证件类型'] = document_type
    record['证件号码*'] = generate_valid_person_id(document_type)
    record['手机号码'] = generate_mobile()
    record['微信号'] = random.choice([generate_wechat(), None, None])
    record['联系地址'] = generate_address()
    record['所在公司'] = random.choice(companies)
    record['企业统一信用码'] = random.choice(
        [generate_social_credit_code(), None, None]
    )
    record['职位'] = random.choice(positions)
    record['关联人员ID'] = random.choice(
        [generate_related_num_id(), None, None, None]
    )
    record['关系类型'] = random.choice(relation_types)
    record['备注'] = None
    return record
# Generate the test rows.
# NOTE: the output file name below still contains the literal "1000"; keep
# the two in sync if TOTAL_ROWS is changed.
TOTAL_ROWS = 1000
PROGRESS_STEP = 100

print(f"正在生成{TOTAL_ROWS}条测试数据...")
data = []
for i in range(TOTAL_ROWS):
    data.append(generate_row(i))
    if (i + 1) % PROGRESS_STEP == 0:
        print(f"已生成 {i + 1} 条...")

# Build the DataFrame; columns follow the key order of the row dicts.
df = pd.DataFrame(data)

# Output file, relative to the current working directory.
output_file = 'doc/test-data/intermediary/intermediary_test_data_1000_valid.xlsx'

# Write the raw sheet first, then re-open it with openpyxl for formatting.
df.to_excel(output_file, index=False, engine='openpyxl')

wb = load_workbook(output_file)
ws = wb.active

# Column widths, keyed by column letter.
column_widths = {
    'A': 15, 'B': 12, 'C': 12, 'D': 8, 'E': 12,
    'F': 20, 'G': 15, 'H': 15, 'I': 30, 'J': 20,
    'K': 20, 'L': 12, 'M': 15, 'N': 12, 'O': 20,
}
for column_letter, width in column_widths.items():
    ws.column_dimensions[column_letter].width = width

# Header row style: grey fill, bold, centered.
header_fill = PatternFill(start_color='D3D3D3', end_color='D3D3D3', fill_type='solid')
header_font = Font(bold=True)
header_alignment = Alignment(horizontal='center', vertical='center')
for cell in ws[1]:
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = header_alignment

# Freeze the header row.
ws.freeze_panes = 'A2'
wb.save(output_file)

# Read the file back and verify the check character of every resident ID.
print("\n正在验证身份证校验码...")
# Force the ID column to text so the numbers are validated as strings and
# are never coerced to a numeric type on the way back in.
df_read = pd.read_excel(output_file, dtype={'证件号码*': str})
id_cards = df_read[df_read['证件类型'] == '身份证']['证件号码*']

invalid_ids = [
    person_id for person_id in id_cards
    if not validate_id_check_code(str(person_id))
]
invalid_count = len(invalid_ids)
valid_count = len(id_cards) - invalid_count

print(f"\n✅ 成功生成{TOTAL_ROWS}条测试数据到: {output_file}")
print("\n=== 身份证校验码验证 ===")
print(f"身份证总数: {len(id_cards)}")
print(f"校验正确: {valid_count}条 ✅")
print(f"校验错误: {invalid_count}")
if invalid_count > 0:
    print("\n错误的身份证号:")
    # Show at most the first ten offenders.
    for pid in invalid_ids[:10]:
        print(f" {pid}")

print("\n=== 数据统计 ===")
print(f"人员类型: {df_read['人员类型'].unique()}")
print(f"性别分布: {dict(df_read['性别'].value_counts())}")
print(f"证件类型分布: {dict(df_read['证件类型'].value_counts())}")
print(f"人员子类型分布: {dict(df_read['人员子类型'].value_counts())}")

print("\n=== 身份证号码样本(已验证校验码)===")
valid_id_samples = id_cards.head(5).tolist()
for sample in valid_id_samples:
    # Both branches used to be empty strings, so the marker carried no
    # information; show an explicit pass/fail symbol instead.
    is_valid = "✅" if validate_id_check_code(str(sample)) else "❌"
    print(f"{sample} {is_valid}")