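"""Generate two Excel files of random intermediary-organisation blacklist test data.

Each file holds 1000 rows built from the lookup tables and generator
functions defined below.
"""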
import random
import string
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
|
||
|
||
# Building blocks for organisation names: city prefix + business keyword + legal-form suffix.
company_prefixes = [
    '北京市', '上海市', '广州市', '深圳市', '杭州市',
    '成都市', '武汉市', '南京市', '西安市', '重庆市',
]
company_keywords = [
    '房产', '地产', '置业', '中介', '经纪',
    '咨询', '投资', '资产', '物业', '不动产',
]
company_suffixes = [
    '有限公司',
    '股份有限公司',
    '集团',
    '企业',
    '合伙企业',
    '有限责任公司',
]

# Entity (subject) types.
entity_types = [
    '企业',
    '个体工商户',
    '农民专业合作社',
    '其他组织',
]

# Enterprise ownership natures.
enterprise_natures = [
    '国有企业',
    '集体企业',
    '私营企业',
    '混合所有制企业',
    '外商投资企业',
    '港澳台投资企业',
]

# Industry classifications.
industry_classes = [
    '房地产业',
    '金融业',
    '租赁和商务服务业',
    '建筑业',
    '批发和零售业',
]

# Specific industries.
industry_names = [
    '房地产中介服务',
    '房地产经纪',
    '房地产开发经营',
    '物业管理',
    '投资咨询',
    '资产管理',
    '商务咨询',
    '市场调查',
    '建筑工程',
    '装饰装修',
    '园林绿化',
]

# Name parts for legal representatives: a surname is joined with a given name.
surnames = [
    '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '周', '吴',
    '徐', '孙', '马', '胡', '朱', '郭', '何', '罗', '高', '林',
]
given_names = [
    '伟', '芳', '娜', '敏', '静', '丽', '强', '磊', '军', '洋',
    '勇', '艳', '杰', '娟', '涛', '明', '超', '秀英', '霞', '平',
]

# Certificate types.
cert_types = [
    '身份证',
    '护照',
    '港澳通行证',
    '台胞证',
    '其他',
]

# Address fragments. Note that `cities` holds district-level names and
# `districts` holds street/building type words; the fragments are simply
# concatenated by generate_register_address().
provinces = [
    '北京市', '上海市', '广东省', '浙江省', '江苏省',
    '四川省', '湖北省', '河南省', '山东省', '福建省',
]
cities = [
    '朝阳区', '海淀区', '浦东新区', '黄浦区', '天河区',
    '福田区', '西湖区', '滨江区', '鼓楼区', '玄武区',
    '武侯区', '江汉区', '金水区', '市南区', '思明区',
]
districts = [
    '街道', '大道', '路', '巷',
    '小区', '花园', '广场', '大厦',
]
street_numbers = [
    '1号', '2号', '3号', '88号', '66号',
    '108号', '188号', '888号', '666号', '168号',
]

# Pool of shareholder names.
shareholder_names = [
    '张伟', '李芳', '王强', '刘军', '陈静', '杨洋',
    '黄勇', '赵艳', '周杰', '吴娟', '徐涛', '孙明',
    '马超', '胡秀英', '朱霞', '郭平', '何桂英', '罗玉兰',
    '高萍', '林毅', '王浩', '李宇', '张轩', '刘然',
]
|
||
|
||
def generate_company_name():
    """Return a random organisation name.

    The name is the concatenation of one entry each from
    ``company_prefixes``, ``company_keywords`` and ``company_suffixes``,
    drawn in that order.
    """
    parts = (
        random.choice(company_prefixes),
        random.choice(company_keywords),
        random.choice(company_suffixes),
    )
    return ''.join(parts)
|
||
|
||
def generate_social_credit_code():
    """Generate an 18-character unified social credit code with a valid check character.

    Layout produced here:
      * position 1     - registration-department code, drawn from 1-5
      * position 2     - organisation-category code, drawn from 1-9
      * positions 3-17 - random digits (not real region/organisation codes)
      * position 18    - check character computed from the first 17
                         characters with the GB 32100-2015 MOD 31 rule

    Returns:
        str: the 18-character code. The last character comes from the
        standard's 31-symbol alphabet, so it may be a letter.
    """
    # Code alphabet of the standard; a character's value is its index here.
    alphabet = "0123456789ABCDEFGHJKLMNPQRTUWXY"
    # Position weights, i.e. 3 ** i mod 31 for i = 0..16.
    weights = (1, 3, 9, 27, 19, 26, 16, 17, 20, 29, 25, 13, 8, 24, 10, 30, 28)

    dept_code = random.choice('12345')
    org_code = random.choice('123456789')
    filler = ''.join(str(random.randint(0, 9)) for _ in range(15))
    body = f"{dept_code}{org_code}{filler}"

    weighted_sum = sum(alphabet.index(ch) * w for ch, w in zip(body, weights))
    # 31 - (sum mod 31), where a result of 31 wraps to 0.
    check_char = alphabet[(31 - weighted_sum % 31) % 31]
    return f"{body}{check_char}"
|
||
|
||
def generate_id_card():
    """Generate an 18-character resident ID number with a valid check character.

    Layout produced here:
      * 6 digits  - area code, a random integer in 110000-650000
                    (not guaranteed to be a real administrative code)
      * 8 digits  - birth date, year 1960-1990, month 1-12, day 1-28
                    (day capped at 28 so every month is valid)
      * 3 digits  - sequence number, 001-999
      * 1 char    - check character computed from the first 17 digits
                    with the ISO 7064 MOD 11-2 rule used by GB 11643

    Returns:
        str: the 18-character ID number; the last character may be ``X``.
    """
    area_code = f"{random.randint(110000, 650000):06d}"
    birth_year = random.randint(1960, 1990)
    birth_month = random.randint(1, 12)
    birth_day = random.randint(1, 28)
    sequence = random.randint(1, 999)
    body = f"{area_code}{birth_year}{birth_month:02d}{birth_day:02d}{sequence:03d}"

    # Per-position weights and the remainder -> check character table.
    weights = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    weighted_sum = sum(int(digit) * w for digit, w in zip(body, weights))
    check_code = "10X98765432"[weighted_sum % 11]
    return f"{body}{check_code}"
|
||
|
||
def generate_other_id():
    """Return a random 8-digit number as a string, used for non-ID-card certificates."""
    # The range starts at 10000000, so the value always has exactly 8 digits.
    number = random.randint(10000000, 99999999)
    return str(number)
|
||
|
||
def generate_register_address():
    """Return a random registered address.

    One entry is drawn from each of ``provinces``, ``cities``,
    ``districts`` and ``street_numbers`` (in that order) and the four
    pieces are concatenated without separators.
    """
    pools = (provinces, cities, districts, street_numbers)
    return ''.join(random.choice(pool) for pool in pools)
|
||
|
||
def generate_establish_date():
    """Return a random establishment date as a ``datetime`` at midnight.

    The date lies between 2000-01-01 (inclusive) and 2024-12-31
    (exclusive): the day offset comes from ``random.randrange``, whose
    upper bound is not included, so 2024-12-31 itself is never returned.
    """
    earliest = datetime(2000, 1, 1)
    span_days = (datetime(2024, 12, 31) - earliest).days
    offset = random.randrange(span_days)
    return earliest + timedelta(days=offset)
|
||
|
||
def generate_legal_representative():
    """Return a random legal representative as ``(name, cert_type, cert_no)``.

    The name is a random surname followed by a random given name. When
    the certificate type is '身份证' the number comes from
    ``generate_id_card()``; every other type gets ``generate_other_id()``.
    """
    surname = random.choice(surnames)
    given_name = random.choice(given_names)
    cert_type = random.choice(cert_types)

    if cert_type == '身份证':
        cert_no = generate_id_card()
    else:
        cert_no = generate_other_id()

    return surname + given_name, cert_type, cert_no
|
||
|
||
def generate_shareholders():
    """Return a list of exactly five shareholder slots.

    Between one and five distinct names are sampled from
    ``shareholder_names`` and placed at the front of the list; the
    remaining slots are ``None``.
    """
    how_many = random.randint(1, 5)
    picked = random.sample(shareholder_names, how_many)
    # Pad with None so callers can always index positions 0-4.
    return picked + [None] * (5 - len(picked))
|
||
|
||
def generate_entity(index):
    """Build one row of intermediary-organisation test data.

    Args:
        index: row number; it only appears in the optional remark text.

    Returns:
        dict: column header -> value. Several optional columns are
        randomly left as an empty string, and unused shareholder slots
        are empty strings as well.
    """
    def keep_or_blank(value, blank_rate):
        # One random draw per call; the value is kept when the draw exceeds blank_rate.
        return value if random.random() > blank_rate else ''

    # Generators are called in a fixed order, before any of the
    # keep-or-blank draws made while the dict below is built.
    name = generate_company_name()
    credit_code = generate_social_credit_code()
    kind = random.choice(entity_types)
    nature = random.choice(enterprise_natures)
    sector = random.choice(industry_classes)
    industry = random.choice(industry_names)
    founded = generate_establish_date().strftime('%Y-%m-%d')
    address = generate_register_address()
    rep_name, rep_cert_type, rep_cert_no = generate_legal_representative()
    holders = generate_shareholders()

    return {
        '机构名称*': name,
        '统一社会信用代码*': credit_code,
        '主体类型': kind,
        '企业性质': keep_or_blank(nature, 0.3),
        '行业分类': keep_or_blank(sector, 0.3),
        '所属行业': keep_or_blank(industry, 0.2),
        '成立日期': keep_or_blank(founded, 0.4),
        '注册地址': address,
        '法定代表人': rep_name,
        '法定代表人证件类型': rep_cert_type,
        '法定代表人证件号码': rep_cert_no,
        '股东1': holders[0] or '',
        '股东2': holders[1] or '',
        '股东3': holders[2] or '',
        '股东4': holders[3] or '',
        '股东5': holders[4] or '',
        '备注': keep_or_blank(f'测试数据{index}', 0.5),
    }
|
||
|
||
# Generate two independent batches of 1000 rows each and save each batch
# to its own Excel workbook.
output1 = r'D:\ccdi\ccdi\doc\test-data\intermediary\机构中介黑名单测试数据_1000条_第1批.xlsx'
output2 = r'D:\ccdi\ccdi\doc\test-data\intermediary\机构中介黑名单测试数据_1000条_第2批.xlsx'

for ordinal, output_path in (('一', output1), ('二', output2)):
    print(f"正在生成第{ordinal}批1000条机构中介黑名单数据...")
    batch = pd.DataFrame([generate_entity(i) for i in range(1, 1001)])

    # to_excel does not create missing folders, so make sure the target
    # directory exists before writing.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    batch.to_excel(output_path, index=False, engine='openpyxl')
    print(f"已生成第{ordinal}个文件: {output_path}")

print("\n✅ 生成完成!")
print(f"文件1: {output1}")
print(f"文件2: {output2}")
print("\n每个文件包含1000条测试数据")
print("数据格式与CcdiIntermediaryEntityExcel.java定义一致")
|