TenkajinKB/parse_data.py
2026-04-19 16:16:20 +08:00

270 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Parse source markdown to generate complete n01_phylums.json with ALL orders and families."""
import json
import re
def species_to_rank(species):
    """Return the letter rank for a species count.

    The count is compared against descending lower bounds and the letter of
    the first bound it reaches is returned: 20000 -> 'S', 5000 -> 'A',
    1000 -> 'B', 100 -> 'C', 10 -> 'D', 2 -> 'E'. Anything below 2 is 'F'.
    """
    lower_bounds = (
        (20000, 'S'),
        (5000, 'A'),
        (1000, 'B'),
        (100, 'C'),
        (10, 'D'),
        (2, 'E'),
    )
    for minimum, letter in lower_bounds:
        if species >= minimum:
            return letter
    return 'F'
# Read the whole overview document, then cut it at every '## ' marker so that
# each chunk begins with the text that followed the marker.
with open('01_Active_Projects/N01_植物帝国企划/owm植物学AVGIV总览.md', 'r', encoding='utf-8') as source_file:
    content = source_file.read()

sections = content.split('## ')
# ─── 1. Parse 目视角 for all angiosperm order data ───
# orders_data maps an order's Chinese name to a record holding its number,
# names and counts. 'family_list' starts empty and is appended to later.
orders_data = {}


def _count_or_zero(text):
    """Return *text* as an int when it consists only of digits, else 0."""
    return int(text) if text.isdigit() else 0


def _signed_or_zero(text):
    """Return *text* as an int when it is an optionally negative integer, else 0.

    A full-match is used so that strings such as '1-2' or '5-' fall back to 0
    instead of reaching int() and raising ValueError.
    """
    return int(text) if re.fullmatch(r'-?\d+', text) else 0


for s in sections:
    if not s.startswith('目视角'):
        continue
    for line in s.strip().split('\n'):
        if not line.startswith('|'):
            continue
        cols = [c.strip() for c in line.split('|')]
        # A data row has more than 14 cells and a numeric order number in
        # cell 8; anything else (header, separator, short row) is skipped.
        if len(cols) <= 14 or not cols[8].isdigit():
            continue
        cn_name = cols[10]
        mode_genus = cols[12]
        # Cells 15 and 16 may be missing on a short row. Cell 15 used to be
        # read unconditionally, which raised IndexError on a 15-cell row.
        spe_count = cols[15] if len(cols) > 15 else ''
        spe_err = cols[16] if len(cols) > 16 else '0'
        orders_data[cn_name] = {
            'num': int(cols[8]),
            'rank': cols[9],
            'name': cn_name,
            'latin': cols[11],
            'modeGenus': mode_genus if mode_genus != 'nan' else '',
            'families': _count_or_zero(cols[13]),
            'genera': _count_or_zero(cols[14]),
            'species': _count_or_zero(spe_count),
            'error': _signed_or_zero(spe_err),
            'family_list': [],
        }
    # Stop after the first section whose text starts with '目视角'.
    break

print(f'Angiosperm orders from 目视角: {len(orders_data)}')
# Gymnosperm orders are entered by hand (figures from 设定详细 and Sheet4).
# Row layout: name, num, rank, latin, families, genera, species, error
_gymno_rows = (
    ('苏铁目', 65, 'C', 'Cycadales', 2, 10, 318, 6),
    ('银杏目', 66, 'F', 'Ginkgoales', 1, 1, 1, 0),
    ('南洋杉目', 67, 'C', 'Araucariales', 2, 6, 71, 5),
    ('柏目', 68, 'C', 'Cupressales', 4, 32, 170, 10),
    ('松目', 69, 'C', 'Pinales', 1, 11, 255, 5),
    ('麻黄目', 70, 'E', 'Ephedrales', 1, 1, 65, 3),
    ('百岁兰目', 71, 'F', 'Welwitschiales', 1, 1, 1, 0),
    ('买麻藤目', 72, 'D', 'Gnetales', 1, 1, 40, 5),
)

# Every record gets its own empty family list and a blank type genus.
gymno_orders = {
    name: {
        'num': num,
        'rank': rank,
        'name': name,
        'latin': latin,
        'families': families,
        'genera': genera,
        'species': species,
        'error': error,
        'family_list': [],
        'modeGenus': '',
    }
    for name, num, rank, latin, families, genera, species, error in _gymno_rows
}

# The same record objects are shared with orders_data, keyed by order name;
# an existing entry with the same name is replaced.
orders_data.update(gymno_orders)

print(f'Total orders (incl. gymnosperms): {len(orders_data)}')
# ─── 2. Parse 科视角 for ALL family data (use order name as key) ───
# Each accepted table row appends one family record to the 'family_list' of
# the order named in cell 2.
for s in sections:
    if not s.startswith('科视角'):
        continue
    for line in s.strip().split('\n'):
        # Only table rows are of interest; rows containing '---' (the
        # header separator) are skipped.
        if not line.startswith('|') or '---' in line:
            continue
        cols = [c.strip() for c in line.split('|')]
        if len(cols) <= 5:
            continue
        order_name = cols[2]
        fam_raw = cols[4]
        # Skip blank / 'nan' cells and the header row ('目名').
        if not order_name or order_name in ('nan', '目名'):
            continue
        if not fam_raw or fam_raw == 'nan':
            continue
        # NOTE(review): the previous code also tested `'' in order_name`,
        # which is true for every string. If a character was lost from that
        # literal, the lookup below still rejects unknown order names.
        if order_name not in orders_data:
            continue
        # Family cell looks like "无油樟科 Amborellaceae" or
        # "无油樟科 Amborellaceae Pers. (1807)". In str patterns \s already
        # matches the full-width space, so no extra character is needed.
        parts = re.split(r'\s+', fam_raw)
        fam_cn = parts[0]
        fam_latin = parts[1] if len(parts) > 1 else ''
        mode_genus = cols[5]
        # Description is built from cells 6..11, ignoring blanks and 'nan'.
        desc = ' '.join(val for val in cols[6:12] if val and val != 'nan')
        orders_data[order_name]['family_list'].append({
            'name': fam_cn,
            'latin': fam_latin,
            'modeGenus': mode_genus if mode_genus != 'nan' else '',
            'description': desc,
        })
    # Stop after the first section whose text starts with '科视角'.
    break

total_fam = sum(len(o['family_list']) for o in orders_data.values())
print(f'Total families parsed: {total_fam}')
# ─── 3. Load existing JSON for game-specific data ───
with open('Dashboard/data/n01_phylums.json', 'r', encoding='utf-8') as previous_file:
    existing = json.load(previous_file)

# Index every order found under existing['data'] by its 'id'. A clade with
# no 'orders' key contributes nothing; a repeated id keeps the last order.
existing_orders = {
    order['id']: order
    for clade in existing['data']
    for order in clade.get('orders', [])
}
# ─── 4. Clade mapping ───
# Reverse lookup: order number -> order name.
num_to_name = {record['num']: name for name, record in orders_data.items()}

# Inclusive order-number ranges and the (id, Chinese name, rank) tuple
# assigned to every order whose number falls inside the range.
_clade_ranges = (
    (1, 3, ('ANITA', '基部被子植物', '演化支')),
    (4, 8, ('Magnoliids', '木兰类植物', '类群')),
    (9, 19, ('Monocots', '单子叶植物', '类群')),
    (20, 20, ('Ceratophyllales_clade', '金鱼藻类', '演化支')),
    (21, 64, ('Eudicots', '真双子叶植物', '类群')),
    (65, 72, ('Gymnosperms', '裸子植物', '类群')),
)

clade_map = {}
for first_num, last_num, clade_info in _clade_ranges:
    for number in range(first_num, last_num + 1):
        # Numbers with no parsed order are simply absent from the map.
        if number in num_to_name:
            clade_map[num_to_name[number]] = clade_info
# ─── 5. Build output ───
# Clades appear in the output in exactly this order.
clade_order = ['ANITA', 'Magnoliids', 'Monocots', 'Ceratophyllales_clade', 'Eudicots', 'Gymnosperms']

# One-line description for each clade.
desc_map = {
    'ANITA': '最原始的被子植物分支,包含无油樟、睡莲和木兰藤',
    'Magnoliids': '原始的被子植物类群,包含木兰、樟、胡椒等',
    'Monocots': '单子叶植物,包含禾本、兰花、棕榈等',
    'Ceratophyllales_clade': '金鱼藻类,水生植物,系统位置特殊',
    'Eudicots': '最大的被子植物类群占被子植物75%以上',
    'Gymnosperms': '古老的种子植物,包含松、杉、银杏、苏铁'
}

# Display name for each clade. These used to be assigned after a first pass
# that derived a name with str.split('') on the description; an empty
# separator raises ValueError, and the derived value was always overwritten,
# so the names are now taken from this table directly.
clade_names = {
    'ANITA': '基部被子植物',
    'Magnoliids': '木兰类植物',
    'Monocots': '单子叶植物',
    'Ceratophyllales_clade': '金鱼藻类',
    'Eudicots': '真双子叶植物',
    'Gymnosperms': '裸子植物',
}

# Clade id -> clade record, each with its own empty 'orders' list.
clades_output = {
    cid: {
        'id': cid,
        'name': clade_names[cid],
        'rank': '演化支' if cid in ('ANITA', 'Ceratophyllales_clade') else '类群',
        'description': desc_map.get(cid, ''),
        'orders': [],
    }
    for cid in clade_order
}
# Walk the orders in ascending order number and file each one under its clade.
# NOTE(review): 'rank' is written as an empty string for both orders and
# families here — confirm against the raw file that this is intended.
for order_name, order in sorted(orders_data.items(), key=lambda item: item[1]['num']):
    # Orders with no clade mapping fall back to the 'Unknown' clade id.
    clade_id = clade_map.get(order_name, ('Unknown', '未分类', '类群'))[0]

    # Previously saved entry for this order (looked up by Latin name), if any.
    existing_order = existing_orders.get(order['latin'], {})

    # One child per family; optional fields are included only when non-empty.
    children = []
    for fam in order['family_list']:
        child = {
            'id': fam['latin'] or fam['name'],
            'name': fam['name'],
            'rank': '',
        }
        for optional_key in ('latin', 'modeGenus', 'description'):
            if fam[optional_key]:
                child[optional_key] = fam[optional_key]
        children.append(child)

    species_total = order['species']
    order_entry = {
        'id': order['latin'],
        'name': order['name'],
        'rank': '',
        'latin': order['latin'],
        'chineseName': order['name'],
        'population': species_total,
        'rankLevel': species_to_rank(species_total),
        'stats': {
            'families': order['families'],
            'genera': order['genera'],
            'species': species_total,
            'error': order['error']
        },
        'children': children
    }

    # Carry over game-specific fields from the previous output when they
    # hold a truthy value.
    for game_key in ('leader', 'territory', 'features'):
        if existing_order.get(game_key):
            order_entry[game_key] = existing_order[game_key]

    # A clade record is created on first use for ids that are not present yet.
    clades_output.setdefault(clade_id, {
        'id': clade_id, 'name': '未分类', 'rank': '类群',
        'description': '', 'orders': []
    })['orders'].append(order_entry)
# ─── 6. Output ───
# NOTE(review): the last two 'hierarchy' entries are empty strings as
# written — confirm against the raw file that no characters were lost.
output = {
    'project': 'N01_植物帝国企划',
    'description': '基于APG IV植物分类系统的AVG游戏阵营设定',
    'version': '2.0.0',
    'lastUpdated': '2026-04-19',
    'hierarchy': ['演化支', '类群', '', ''],
    'data': list(clades_output.values())
}

# Flatten all orders once, then derive the summary figures from that list.
_all_orders = [order for clade in output['data'] for order in clade['orders']]
total_orders = len(_all_orders)
total_families = sum(len(order['children']) for order in _all_orders)
total_species = sum(order['stats']['species'] for order in _all_orders)

print('\n=== Final Output ===')
print(f'Clades: {len(output["data"])}')
print(f'Orders: {total_orders}')
print(f'Families: {total_families}')
print(f'Species: {total_species:,}')

# Overwrite the dashboard data file with the result, keeping non-ASCII text
# readable in the JSON.
with open('Dashboard/data/n01_phylums.json', 'w', encoding='utf-8') as out_file:
    json.dump(output, out_file, ensure_ascii=False, indent=2)

print('\nSaved to Dashboard/data/n01_phylums.json')