Files
ccdi/doc/test-data/generate_org_data.py

193 lines
7.9 KiB
Python
Raw Normal View History

2026-01-29 22:03:42 +08:00
import openpyxl
from openpyxl import Workbook
import random
from datetime import datetime, timedelta
# City prefixes used at the start of an organisation name
org_prefixes = [
    "北京", "上海", "广州", "深圳", "杭州",
    "成都", "重庆", "武汉", "西安", "南京",
    "天津", "苏州", "长沙", "郑州", "东莞",
    "青岛", "沈阳", "宁波", "厦门", "佛山",
]

# Business-type keywords used in the middle of an organisation name
org_types = [
    "投资咨询",
    "资产管理",
    "证券投资",
    "基金管理",
    "股权投资",
    "财富管理",
    "金融信息服务",
    "商务咨询",
    "企业咨询",
    "投资顾问",
]

# Suffixes used at the end of an organisation name
org_suffixes = [
    "有限公司",
    "股份有限公司",
    "集团",
    "企业",
    "事务所",
]

# Entity (subject) types
entity_types = [
    "企业",
    "事业单位",
    "社会组织",
]

# Corporate natures
corp_natures = [
    "有限责任公司",
    "股份有限公司",
    "国有独资",
    "集体企业",
    "私营企业",
    "中外合资",
    "外商独资",
    "港澳台合资",
]

# Industry classifications
industry_classes = [
    "金融业",
    "商务服务业",
    "科学研究和技术服务业",
]

# Industries
industries = [
    "货币金融服务",
    "资本市场服务",
    "保险业",
    "其他金融业",
    "企业管理服务",
    "法律服务",
    "咨询与调查",
    "广告业",
    "研究和试验发展",
    "专业技术服务业",
    "科技推广和应用服务业",
]

# Identity document types
id_types = [
    "身份证",
    "护照",
    "其他",
]
# Generate an 18-character unified social credit code
def generate_credit_code():
    """Return a random 18-character unified social credit code.

    Layout: registration-department code (1) + organisation category
    code (1) + administrative division code (6) + organisation code (9) +
    check character (1).

    The earlier version concatenated only the division code, the
    organisation code and a random digit, which yields 16 characters rather
    than the 18 the format requires.  This version prepends the two leading
    codes and derives the check character from the first 17 characters.

    Returns:
        str: 18 characters; the last one may be a digit or an upper-case
        letter from the code alphabet.
    """
    # Code alphabet: digits plus upper-case letters without I, O, S, V, Z.
    charset = "0123456789ABCDEFGHJKLMNPQRTUWXY"

    # "9" = market-regulation registration, "1" = enterprise.
    prefix = "91"
    area_code = f"{random.randint(110000, 659900):06d}"
    org_code = "".join(str(random.randint(0, 9)) for _ in range(9))
    body = f"{prefix}{area_code}{org_code}"

    # Position i (0-based) carries weight 3**i mod 31; the check value is the
    # amount needed to bring the weighted sum up to a multiple of 31.
    weighted_sum = sum(
        charset.index(char) * pow(3, position, 31)
        for position, char in enumerate(body)
    )
    check_char = charset[(31 - weighted_sum % 31) % 31]
    return f"{body}{check_char}"
# Generate a person name (used for legal representatives and shareholders)
def generate_person_name():
    """Return a random Chinese personal name: surname + given name.

    The name is built from one surname, one given-name part and an optional
    trailing character, so the result is always 2 to 4 characters long and
    never empty.

    NOTE(review): in the reviewed source every pool entry except "秀英" was
    an empty string, which made this function return "" for almost every
    call.  The pools below were refilled with common surnames and given-name
    characters (pool sizes unchanged); swap in the project's intended values
    if they are known.

    Returns:
        str: the generated name.
    """
    surnames = ["王", "李", "张", "刘", "陈", "杨", "黄", "赵", "吴", "周",
                "徐", "孙", "马", "朱", "胡", "郭", "何", "高", "林", "罗"]
    names1 = ["伟", "芳", "娜", "敏", "静", "丽", "强", "磊", "军", "洋",
              "勇", "艳", "杰", "娟", "涛", "明", "超", "秀英", "霞", "平"]
    # Empty entries keep shorter names in the mix.
    names2 = ["", "", "", "", "", "华", "国", "文", "辉", "红"]
    return random.choice(surnames) + random.choice(names1) + random.choice(names2)
# Generate an 18-character resident identity card number
def generate_id_card():
    """Return a random 18-character resident ID number.

    Layout: area code (6) + birth date YYYYMMDD (8) + sequence number (3) +
    check character (1).  Birth years are drawn from 1960-1995 and the day
    is capped at 28 so every month/day combination is a real date.

    The earlier version appended a random digit as the check character, so
    most generated numbers failed checksum validation.  The check character
    is now computed with the MOD 11-2 scheme and may therefore be "X".

    Returns:
        str: 17 digits followed by a digit or "X".
    """
    area_code = f"{random.randint(110000, 659900):06d}"
    year = random.randint(1960, 1995)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    sequence = random.randint(1, 999)
    body = f"{area_code}{year}{month:02d}{day:02d}{sequence:03d}"

    # Per-position weights and the remainder -> check-character table.
    weights = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    check_table = "10X98765432"
    remainder = sum(int(digit) * weight for digit, weight in zip(body, weights)) % 11
    return f"{body}{check_table[remainder]}"
# Generate a registered address
def generate_address():
    """Return a random address: district + street + number + building type.

    Returns:
        str: the four parts concatenated without separators; the number is
        an integer between 1 and 999.
    """
    district_pool = (
        "朝阳区", "海淀区", "西城区", "东城区", "丰台区",
        "浦东新区", "黄浦区", "静安区", "徐汇区", "天河区",
        "福田区", "南山区", "罗湖区", "西湖区", "江干区",
    )
    street_pool = (
        "建设路", "人民路", "解放路", "和平路", "文化路",
        "科技路", "创新路", "发展路", "创业路", "工业路",
    )
    building_pool = ("大厦", "中心", "广场", "写字楼", "科技园")

    # Draw the parts in the same left-to-right order they appear in the result.
    district = random.choice(district_pool)
    street = random.choice(street_pool)
    number = random.randint(1, 999)
    building = random.choice(building_pool)
    return "".join((district, street, str(number), building))
# Generate an establishment date
def generate_establish_date():
    """Return a random date between 2000-01-01 and 2024-12-31 inclusive.

    Returns:
        str: the date formatted as ``YYYY-MM-DD``.
    """
    earliest = datetime(2000, 1, 1)
    latest = datetime(2024, 12, 31)
    span_days = (latest - earliest).days
    offset = timedelta(days=random.randint(0, span_days))
    return f"{earliest + offset:%Y-%m-%d}"
# Generate a shareholder name
def generate_shareholder():
    """Return a random shareholder name, either a company or a person.

    One of two builders is picked with equal probability:

    * a company name made of a city prefix, one of four business words and
      the fixed suffix "有限公司";
    * a person name from ``generate_person_name()``, optionally followed by
      the tag "(自然人)".

    Returns:
        str: the generated shareholder name.
    """
    def company_shareholder():
        city = random.choice(org_prefixes)
        business = random.choice(['投资', '资本', '控股', '集团'])
        return f"{city}{business}有限公司"

    def person_shareholder():
        person = generate_person_name()
        tag = random.choice(["", "(自然人)"])
        return person + tag

    builder = random.choice([company_shareholder, person_shareholder])
    return builder()
# Generate a remark
def generate_remark():
    """Return a random remark; half of the pool is the empty string.

    Returns:
        str: "" or one of the four status labels.
    """
    labels = ("重点监控", "已整改", "存在风险", "待核查")
    # Four blanks followed by four labels: a blank remark is as likely as a
    # labelled one.
    pool = ("",) * 4 + labels
    return random.choice(pool)
# Generate a single organisation record
def generate_org_data(index):
    """Build one random organisation record.

    Args:
        index: value stored under the ``"id"`` key.

    Returns:
        dict: keys ``id``, ``orgName``, ``creditCode``, ``entityType``,
        ``corpNature``, ``industryClass``, ``industry``, ``establishDate``,
        ``regAddress``, ``legalRep``, ``legalRepIdType``, ``legalRepIdNo``,
        ``shareholder1`` .. ``shareholder5`` and ``remark``.  Unused
        shareholder slots and (about half the time) ``corpNature`` are
        empty strings.
    """
    slot_count = 5

    # Between one and five real shareholders, padded with blanks to five slots.
    shareholders = [generate_shareholder() for _ in range(random.randint(1, slot_count))]
    shareholders += [""] * (slot_count - len(shareholders))

    # Identity cards get a full ID number; any other document type gets a
    # short code made of one letter and five digits.
    id_type = random.choice(id_types)
    if id_type == "身份证":
        id_number = generate_id_card()
    else:
        letter = random.choice(['A', 'B', 'C'])
        digits = random.randint(10000, 99999)
        id_number = f"{letter}{digits}"

    record = {"id": index}

    name_prefix = random.choice(org_prefixes)
    name_type = random.choice(org_types)
    name_suffix = random.choice(org_suffixes)
    record["orgName"] = f"{name_prefix}{name_type}{name_suffix}"

    record["creditCode"] = generate_credit_code()
    record["entityType"] = random.choice(entity_types)

    # Coin flip first; a corporate nature is only drawn when it comes up True.
    if random.choice([True, False]):
        record["corpNature"] = random.choice(corp_natures)
    else:
        record["corpNature"] = ""

    record["industryClass"] = random.choice(industry_classes)
    record["industry"] = random.choice(industries)
    record["establishDate"] = generate_establish_date()
    record["regAddress"] = generate_address()
    record["legalRep"] = generate_person_name()
    record["legalRepIdType"] = id_type
    record["legalRepIdNo"] = id_number

    for slot, holder in enumerate(shareholders, start=1):
        record[f"shareholder{slot}"] = holder

    record["remark"] = generate_remark()
    return record
# Generate records and save them to an Excel workbook
def generate_org_test_data(filename, count=1000, start_id=1,
                           template_path="机构中介黑名单模板_1769674571626.xlsx"):
    """Generate ``count`` organisation records and write them to ``filename``.

    Row 1 of the output is copied from row 1 of the template workbook's
    active sheet; generated records follow from row 2 onward, one per row.
    The record's ``"id"`` value is generated but not written to the sheet.

    Args:
        filename: path of the ``.xlsx`` file to create.
        count: number of records to generate.
        start_id: index passed to ``generate_org_data`` for the first
            record; later records use consecutive values.
        template_path: workbook whose first row supplies the header.
            Defaults to the template file name the script has always used,
            resolved relative to the current working directory.

    Raises:
        Whatever ``openpyxl.load_workbook`` / ``Workbook.save`` raise when
        the template cannot be read or the output cannot be written.
    """
    # Record keys in output-column order (column 1 first).  Keeping the
    # order in one place avoids the column numbers and keys drifting apart.
    column_keys = (
        "orgName",
        "creditCode",
        "entityType",
        "corpNature",
        "industryClass",
        "industry",
        "establishDate",
        "regAddress",
        "legalRep",
        "legalRepIdType",
        "legalRepIdNo",
        "shareholder1",
        "shareholder2",
        "shareholder3",
        "shareholder4",
        "shareholder5",
        "remark",
    )

    # Read the header row, then release the template workbook.
    template_wb = openpyxl.load_workbook(template_path)
    try:
        header = [(cell.column, cell.value) for cell in template_wb.active[1]]
    finally:
        template_wb.close()

    wb = Workbook()
    ws = wb.active
    ws.title = "机构中介黑名单"

    for column, value in header:
        ws.cell(row=1, column=column, value=value)

    for offset in range(count):
        data = generate_org_data(start_id + offset)
        row_idx = offset + 2  # row 1 holds the header
        for col_idx, key in enumerate(column_keys, start=1):
            ws.cell(row=row_idx, column=col_idx, value=data[key])

    wb.save(filename)
    print(f"已生成文件: {filename}")
if __name__ == "__main__":
    # Two batches of 1000 records each; the second batch continues the index
    # sequence where the first one stops.
    print("开始生成机构中介黑名单测试数据...")
    batches = (
        ("机构中介黑名单测试数据_1000条.xlsx", 1),
        ("机构中介黑名单测试数据_1000条_第2批.xlsx", 1001),
    )
    for output_name, first_id in batches:
        generate_org_test_data(output_name, 1000, first_id)
    print("完成!")