Files
ccdi/doc/scripts/generate_test_data.py

194 lines
7.6 KiB
Python
Raw Normal View History

2026-01-29 22:03:42 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
根据模板文件生成1000条个人中介黑名单测试数据
"""
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment
import random
from datetime import datetime
# Configuration: template workbook to read, workbook to write, and how many
# data rows to generate. Both paths are relative, so they are resolved against
# the current working directory of the process.
TEMPLATE_FILE = "doc/个人中介黑名单模板_1769667622015.xlsx"
OUTPUT_FILE = "doc/个人中介黑名单测试数据_1000条_第2批.xlsx"
ROW_COUNT = 1000
# Surname and given-name pools sampled by generate_name().
# NOTE(review): every SURNAMES entry and most GIVEN_NAMES entries are empty
# strings here, so generated names are blank or partial. This looks like
# single-character values that were lost somewhere - confirm against the
# authoritative copy of this script and restore them.
SURNAMES = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
GIVEN_NAMES = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '秀英', '', '', '', '桂英', '玉兰', '', '', '', '', '', '', '']
# Person types
INDIV_TYPES = ['中介', '职业背债人', '房产中介', '贷款中介', '其他']
# Person sub-types
INDIV_SUB_TYPES = ['本人', '配偶', '父亲', '母亲', '儿子', '女儿']
# Genders
# NOTE(review): both entries are empty strings, so the gender column is always
# written blank - presumably the original values were lost; confirm and restore.
GENDERS = ['', '']
# Certificate types; generate_id_number() branches on these labels
CERT_TYPES = ['身份证', '护照', '军官证', '其他']
# Relationship to the associated person
RELATIONS = ['配偶', '父母', '子女', '兄弟姐妹', '同事', '朋友', '合伙人', '其他']
# Company names
COMPANIES = ['中原地产', '链家地产', '我爱我家', '21世纪不动产', 'Q房网', '安居客', '房天下', '麦田房产', '鑫置地产', '嘉业地产']
# Job positions
POSITIONS = ['经纪人', '高级经纪人', '店长', '区域经理', '业务员', '顾问', '总监', '助理', '专员']
# City name -> list of district names, sampled by generate_address()
CITIES = {
'北京': ['朝阳区', '海淀区', '东城区', '西城区', '丰台区', '通州区'],
'上海': ['浦东新区', '黄浦区', '徐汇区', '长宁区', '静安区', '普陀区'],
'广州': ['天河区', '越秀区', '海珠区', '荔湾区', '白云区', '番禺区'],
'深圳': ['福田区', '南山区', '罗湖区', '宝安区', '龙岗区', '盐田区'],
'杭州': ['西湖区', '上城区', '下城区', '江干区', '拱墅区', '滨江区'],
'成都': ['武侯区', '锦江区', '青羊区', '金牛区', '成华区', '高新区'],
'武汉': ['武昌区', '江岸区', '江汉区', '硚口区', '汉阳区', '洪山区'],
'南京': ['玄武区', '秦淮区', '建邺区', '鼓楼区', '浦口区', '栖霞区']
}
def generate_id_number(cert_type):
    """Return a random certificate number formatted for ``cert_type``.

    Args:
        cert_type: Certificate type label. '身份证', '护照' and '军官证' each
            have their own format; any other value gets the generic
            'QT'-prefixed fallback.

    Returns:
        A string:
        * '身份证' - 18 characters: 6-digit region number, 8-digit birth
          date (YYYYMMDD), 3-digit sequence and one check character.
        * '护照'   - 'E' followed by 8 digits.
        * '军官证' - '军字第' followed by 7 digits.
        * other    - 'QT' followed by 9 digits.
    """
    # Handle the simple prefixed formats first so the ID-card logic below
    # reads straight through.
    if cert_type == '护照':
        return f"E{random.randint(10000000, 99999999)}"
    if cert_type == '军官证':
        return f"军字第{random.randint(1000000, 9999999)}"
    if cert_type != '身份证':
        return f"QT{random.randint(100000000, 999999999)}"

    # First 17 characters of the ID-card number.
    region = random.randint(110000, 659000)
    year = random.randint(1960, 2000)
    month = random.randint(1, 12)
    day = random.randint(1, 28)  # capped at 28 so the date exists in every month
    serial = random.randint(100, 999)
    body = f"{region}{year}{month:02d}{day:02d}{serial}"

    # Check character: weighted digit sum modulo 11, mapped through the
    # lookup string below.
    factors = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    weighted_sum = sum(int(digit) * factor for digit, factor in zip(body, factors))
    return body + "10X98765432"[weighted_sum % 11]
def generate_phone():
    """Return a random 11-digit mobile number as a string.

    The number is a 3-digit prefix (130-139, 150-159 except 154, 180-189)
    followed by an 8-digit random integer, so the fourth digit is never 0.
    """
    carrier_prefixes = (
        [f"13{d}" for d in range(10)]
        + [f"15{d}" for d in range(10) if d != 4]
        + [f"18{d}" for d in range(10)]
    )
    head = random.choice(carrier_prefixes)
    tail = random.randint(10000000, 99999999)
    return f"{head}{tail}"
def generate_wechat():
    """Return a random WeChat ID of the form 'wx_' plus 8 digits."""
    numeric_part = random.randint(10000000, 99999999)
    return "wx_" + str(numeric_part)
def generate_address():
    """Return a random contact address string.

    The address is the plain concatenation (no separators) of: a city from
    CITIES, one of that city's districts, a street name, a street number
    (1-999), a building label and a room number (101-2606).
    """
    streets = ('中山路', '解放路', '人民路', '建设路', '文化路', '和平路', '友谊路', '光明路')
    buildings = ('A座', 'B座', '1号楼', '2号楼', '东苑', '西苑', '南区', '北区')

    city = random.choice(list(CITIES))
    # The list literal is evaluated left to right, so the random draws happen
    # in the order the parts appear in the address.
    parts = [
        city,
        random.choice(CITIES[city]),
        random.choice(streets),
        str(random.randint(1, 999)),
        random.choice(buildings),
        str(random.randint(101, 2606)),
    ]
    return "".join(parts)
def generate_name():
    """Return a random full name built from SURNAMES and GIVEN_NAMES.

    One surname is drawn, followed by one given-name entry with roughly 70%
    probability or two concatenated entries otherwise. Because GIVEN_NAMES
    entries may themselves be more than one character long, the resulting
    name length varies.
    """
    family = random.choice(SURNAMES)
    # random() > 0.3 holds about 70% of the time -> single given-name entry.
    picks = 1 if random.random() > 0.3 else 2
    return family + "".join(random.choice(GIVEN_NAMES) for _ in range(picks))
def main():
    """Fill a copy of the template workbook with ROW_COUNT random rows.

    Loads TEMPLATE_FILE, reads the header row of its active sheet, blanks
    every existing data row, writes random values into rows 2..ROW_COUNT + 1
    (matched to columns by header text), saves the workbook to OUTPUT_FILE
    and prints a preview of the first rows. Template headers that have no
    generated value are left empty.

    Errors are reported on stdout (with a traceback for unexpected ones) and
    are not re-raised.
    """
    print(f"正在读取模板文件: {TEMPLATE_FILE}")
    try:
        # Only a failure to open the template is reported as "template not
        # found"; a FileNotFoundError raised later (e.g. a missing output
        # directory on save) falls through to the generic handler instead.
        try:
            wb = openpyxl.load_workbook(TEMPLATE_FILE)
        except FileNotFoundError:
            print(f"✗ 错误:找不到模板文件 {TEMPLATE_FILE}")
            print("请确保模板文件存在于正确的路径")
            return
        ws = wb.active

        # Keep each header together with the column it actually occupies, so
        # a blank cell in the header row cannot shift later values into the
        # wrong column.
        header_columns = [
            (col_idx, cell.value)
            for col_idx, cell in enumerate(ws[1], start=1)
            if cell.value
        ]
        headers = [header for _, header in header_columns]
        print(f"模板表头: {headers}")
        print(f"开始生成 {ROW_COUNT} 条测试数据...")

        # Blank every existing data row; the header row (row 1) is kept.
        for row in range(2, ws.max_row + 1):
            for col in range(1, ws.max_column + 1):
                ws.cell(row=row, column=col).value = None

        # Generate the data rows, starting right below the header.
        for i in range(2, ROW_COUNT + 2):
            indiv_type = random.choice(INDIV_TYPES)
            gender = random.choice(GENDERS)
            cert_type = random.choice(CERT_TYPES)
            # Values keyed by the header text expected in the template.
            row_data = {
                '姓名': generate_name(),
                '证件号码': generate_id_number(cert_type),
                '人员类型': indiv_type,
                '人员子类型': random.choice(INDIV_SUB_TYPES),
                '性别': gender,
                '证件类型': cert_type,
                '手机号': generate_phone(),
                '微信号': generate_wechat(),
                '联系地址': generate_address(),
                '所在公司': random.choice(COMPANIES),
                '职位': random.choice(POSITIONS),
                # About 20% of rows get an associated person ID.
                '关联人员ID': str(random.randint(1000, 9999)) if random.random() > 0.8 else '',
                # About 50% of rows get a relationship label.
                '关联关系': random.choice(RELATIONS) if random.random() > 0.5 else '',
                '备注': f'测试数据{i-1}'
            }
            for col_idx, header in header_columns:
                if header in row_data:
                    ws.cell(row=i, column=col_idx, value=row_data[header])
            if (i - 1) % 100 == 0:
                print(f"已生成 {i-1} 条数据...")

        print(f"\n正在保存文件到: {OUTPUT_FILE}")
        wb.save(OUTPUT_FILE)
        print(f"✓ 成功生成 {ROW_COUNT} 条测试数据")
        print(f"✓ 文件已保存至: {OUTPUT_FILE}")
        print(f"✓ 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        # Preview: first six header columns of up to three generated rows
        # (fewer when ROW_COUNT is smaller than 3).
        print("\n前3条数据示例:")
        print("-" * 100)
        preview_columns = header_columns[:6]
        for i in range(2, min(ROW_COUNT, 3) + 2):
            pairs = []
            for col_idx, header in preview_columns:
                val = ws.cell(row=i, column=col_idx).value
                # Only a missing value is shown as blank; other falsy values
                # are printed as-is.
                text = "" if val is None else str(val)
                pairs.append(f"{header}:{text}")
            print(f"{i-1}行: {', '.join(pairs)}")
    except Exception as e:
        print(f"✗ 错误:{str(e)}")
        import traceback
        traceback.print_exc()
# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()