Files
ccdi/doc/test-data/intermediary/generate-test-data-1000-valid.py

216 lines
7.8 KiB
Python
Raw Normal View History

2026-02-09 00:13:32 +08:00
import pandas as pd
import random
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment
def calculate_id_check_code(id_17):
    """
    Compute the check character of a resident ID number (GB 11643-1999).

    :param id_17: the first 17 characters of the ID number, all digits
    :return: the check character, one of '0'-'9' or 'X'
    """
    # Weight applied to each of the 17 leading positions.
    weights = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    # Check characters, indexed by (weighted sum mod 11).
    check_codes = "10X98765432"

    total = 0
    for position, weight in enumerate(weights):
        total += int(id_17[position]) * weight
    return check_codes[total % 11]
def generate_valid_person_id(id_type):
    """
    Generate a random document number for the given document type.

    For '身份证' the result has 18 characters: a 6-digit region code, a
    4-digit year, 2-digit month, 2-digit day, a 3-digit sequence code and the
    check character returned by calculate_id_check_code. Every other type
    yields an 8-digit number as a string.
    """
    if id_type != '身份证':
        # Passport, Taiwan compatriot permit, HK/Macau pass: 8 digits.
        return str(random.randint(10000000, 99999999))

    # The draws are made in this fixed order: region, year, month, day, sequence.
    region = random.randint(110000, 659999)
    year = random.randint(1960, 2000)
    month = random.randint(1, 12)
    day = random.randint(1, 28)  # 1-28 exists in every month
    sequence = random.randint(0, 999)

    # First 17 characters, then the check character.
    body = f"{region}{year}{month:02d}{day:02d}{sequence:03d}"
    return body + calculate_id_check_code(body)
# Validate the check character of a resident ID number
def validate_id_check_code(person_id):
    """
    Return whether person_id ends with the correct check character.

    :param person_id: candidate 18-character ID number
    :return: True when the 18th character (case-insensitive) equals the check
        character computed from the first 17; False otherwise, including for
        non-string input, a length other than 18, or non-numeric characters
        among the first 17
    """
    if not isinstance(person_id, str) or len(person_id) != 18:
        return False
    id_17 = person_id[:17]
    # calculate_id_check_code calls int() on every character and raises
    # ValueError on a non-decimal one; a validator should answer False instead.
    if not id_17.isdecimal():
        return False
    return calculate_id_check_code(id_17) == person_id[17].upper()
# Value pools used by the generators below.
# The three name pools must contain non-empty strings: '姓名*' is built by
# concatenating one surname and one given name, so empty entries would yield
# an empty name.
last_names = ['王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周',
              '徐', '孙', '马', '朱', '胡', '郭', '何', '高', '林', '罗']
first_names_male = ['伟', '强', '磊', '军', '洋', '勇', '杰', '涛', '明', '超',
                    '刚', '平', '辉', '鹏', '华', '飞', '鑫', '波', '斌', '宇']
first_names_female = ['芳', '娜', '敏', '静', '丽', '艳', '娟', '霞', '燕', '玲',
                      '婷', '雪', '慧', '颖', '倩', '琳', '洁', '梅', '莉', '兰']
person_types = ['中介']
person_sub_types = ['本人', '配偶', '子女', '父母', '其他']
genders = ['M', 'F', 'O']
id_types = ['身份证', '护照', '台胞证', '港澳通行证']
companies = [
    '房屋租赁公司', '房产经纪公司', '投资咨询公司', '置业咨询公司', '不动产咨询公司',
    '物业管理公司', '资产评估公司', '土地评估公司', '地产代理公司', '房产咨询公司',
]
# None entries make the corresponding column blank for some rows.
positions = ['区域经理', '店长', '高级经纪人', '房产经纪人', '销售经理', '置业顾问', '物业顾问', '评估师', '业务员', '总监', '主管', None]
relation_types = ['配偶', '子女', '父母', '兄弟姐妹', None, None]
provinces = ['北京市', '上海市', '广东省', '江苏省', '浙江省', '四川省', '河南省', '福建省', '湖北省', '湖南省']
districts = ['海淀区', '朝阳区', '天河区', '浦东新区', '西湖区', '黄浦区', '静安区', '徐汇区', '福田区', '罗湖区']
# NOTE(review): the two empty entries look like lost single-character
# suffixes; left unchanged here, confirm against the original file.
streets = ['', '大街', '大道', '街道', '', '广场', '大厦', '花园']
buildings = ['1号楼', '2号楼', '3号楼', '4号楼', '5号楼', '6号楼', '7号楼', '8号楼', 'A座', 'B座']
def generate_name(gender):
    """
    Return a random surname followed by a random given name.

    The male given-name pool is used when gender is 'M'; every other value
    draws from the female pool.
    """
    if gender == 'M':
        given_pool = first_names_male
    else:
        given_pool = first_names_female
    # Surname is drawn first, then the given name.
    surname = random.choice(last_names)
    given = random.choice(given_pool)
    return surname + given
def generate_mobile():
    """
    Return a random 11-digit mobile number as a string.

    Layout: '1', a second digit from {3, 5, 7, 8, 9}, one digit 0-9, then an
    8-digit number between 10000000 and 99999999.
    """
    second = random.choice([3, 5, 7, 8, 9])
    third = random.randint(0, 9)
    tail = random.randint(10000000, 99999999)
    return "1" + str(second) + str(third) + str(tail)
def generate_wechat():
    """Return a WeChat-style ID: 'wx_' plus 8 random lowercase letters or digits."""
    alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789'
    suffix = random.choices(alphabet, k=8)
    return "wx_" + "".join(suffix)
def generate_address():
    """
    Return a random address string.

    Concatenates one entry each from provinces, districts and streets with a
    number between 1 and 100, in that order.
    """
    province = random.choice(provinces)
    district = random.choice(districts)
    street = random.choice(streets)
    number = random.randint(1, 100)
    return f"{province}{district}{street}{number}"
def generate_social_credit_code():
    """
    Generate a random 18-character unified social credit code (GB 32100-2015).

    Layout: the fixed prefix '91', a 6-digit region code, a 9-digit
    organization code and one check character computed from the first 17.

    :return: the 18-character code as a string
    """
    # Code alphabet of the standard; a character's index is its numeric value.
    charset = "0123456789ABCDEFGHJKLMNPQRTUWXY"

    region = random.randint(110000, 659999)
    organization = random.randint(0, 999999999)
    body = f"91{region}{organization:09d}"

    # Position i (0-based) carries weight 3**i mod 31; the check value is
    # 31 minus the weighted sum mod 31, with 31 itself mapped to 0.
    weighted_sum = sum(
        charset.index(ch) * pow(3, i, 31) for i, ch in enumerate(body)
    )
    check_char = charset[(31 - weighted_sum % 31) % 31]
    return body + check_char
def generate_related_num_id():
    """Return a related-person identifier: 'ID' followed by a number in 10000-99999."""
    number = random.randint(10000, 99999)
    return "ID" + str(number)
def generate_row(index):
    """
    Build one row of test data as a dict keyed by the sheet's column headers.

    :param index: row index supplied by the caller; not used in the row itself
    :return: dict mapping each column header to its generated value, with
        None for cells left blank
    """
    gender = random.choice(genders)
    person_sub_type = random.choice(person_sub_types)
    id_type = random.choice(id_types)

    # Columns are filled in sheet order, which also keeps the sequence of
    # random draws fixed.
    row = {}
    row['姓名*'] = generate_name(gender)
    row['人员类型'] = '中介'
    row['人员子类型'] = person_sub_type
    row['性别'] = gender
    row['证件类型'] = id_type
    row['证件号码*'] = generate_valid_person_id(id_type)
    row['手机号码'] = generate_mobile()

    # Optional columns: the candidate value is always generated, then kept
    # or replaced by None through random.choice.
    wechat = generate_wechat()
    row['微信号'] = random.choice([wechat, None, None])
    row['联系地址'] = generate_address()
    row['所在公司'] = random.choice(companies)
    credit_code = generate_social_credit_code()
    row['企业统一信用码'] = random.choice([credit_code, None, None])
    row['职位'] = random.choice(positions)
    related_id = generate_related_num_id()
    row['关联人员ID'] = random.choice([related_id, None, None, None])
    row['关系类型'] = random.choice(relation_types)
    row['备注'] = None
    return row
# Generate 1000 rows, reporting progress every 100 rows.
print("正在生成1000条测试数据...")
data = []
for i in range(1000):
    row = generate_row(i)
    data.append(row)
    if (i + 1) % 100 == 0:
        print(f"已生成 {i + 1} 条...")

# Build the DataFrame; column order follows the key order of each row dict.
df = pd.DataFrame(data)

# Output file, relative to the current working directory.
output_file = 'doc/test-data/intermediary/intermediary_test_data_1000_valid.xlsx'

# Save to Excel.
df.to_excel(output_file, index=False, engine='openpyxl')

# Reopen the saved workbook to apply formatting.
wb = load_workbook(output_file)
ws = wb.active

# Column widths, keyed by column letter (A-O, one per generated column).
column_widths = {
    'A': 15, 'B': 12, 'C': 12, 'D': 8, 'E': 12,
    'F': 20, 'G': 15, 'H': 15, 'I': 30, 'J': 20,
    'K': 20, 'L': 12, 'M': 15, 'N': 12, 'O': 20,
}
for column_letter, column_width in column_widths.items():
    ws.column_dimensions[column_letter].width = column_width

# Header row style: grey fill, bold, centered.
header_fill = PatternFill(start_color='D3D3D3', end_color='D3D3D3', fill_type='solid')
header_font = Font(bold=True)
for cell in ws[1]:
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal='center', vertical='center')

# Freeze the header row.
ws.freeze_panes = 'A2'
wb.save(output_file)

# Read the file back and verify every resident ID's check character.
print("\n正在验证身份证校验码...")
# Force the ID column to str so all-digit IDs are never parsed as numbers.
df_read = pd.read_excel(output_file, dtype={'证件号码*': str})
id_cards = df_read[df_read['证件类型'] == '身份证']['证件号码*']
valid_count = 0
invalid_count = 0
invalid_ids = []
for person_id in id_cards:
    if validate_id_check_code(str(person_id)):
        valid_count += 1
    else:
        invalid_count += 1
        invalid_ids.append(person_id)

print(f"\n✅ 成功生成1000条测试数据到: {output_file}")
print("\n=== 身份证校验码验证 ===")
print(f"身份证总数: {len(id_cards)}")
print(f"校验正确: {valid_count}条 ✅")
print(f"校验错误: {invalid_count}")
if invalid_count > 0:
    print("\n错误的身份证号:")
    # Show at most the first 10 failures.
    for pid in invalid_ids[:10]:
        print(f" {pid}")

print("\n=== 数据统计 ===")
print(f"人员类型: {df_read['人员类型'].unique()}")
print(f"性别分布: {dict(df_read['性别'].value_counts())}")
print(f"证件类型分布: {dict(df_read['证件类型'].value_counts())}")
print(f"人员子类型分布: {dict(df_read['人员子类型'].value_counts())}")

print("\n=== 身份证号码样本(已验证校验码)===")
valid_id_samples = id_cards.head(5).tolist()
for sample in valid_id_samples:
    # Mark each sample with its validation result; both branches used to be
    # empty strings, so the result was never shown.
    is_valid = "✅" if validate_id_check_code(str(sample)) else "❌"
    print(f"{sample} {is_valid}")