"""
教育用再現コード: 2023年度 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：熱中症を防ごう！
区分：高校生の部

【分析概要】
  データ：SSDSE-B-2026.csv（都道府県パネルデータ 47都道府県 × 2012-2023年度）
          2022年度 47都道府県の断面データ（散布図・相関分析）
          2012-2023年 全国平均の時系列（気温トレンド）

  目的変数（熱中症リスク代理指標）：
    最高気温 B4102 — 日最高気温の月平均の最高値（℃）
    保健医療費割合 L322106/L3221 × 100（%）— 健康負担の代理指標

  説明変数：
    B4101 : 年平均気温（℃）
    B4102 : 最高気温（℃）
    B4103 : 最低気温（℃）
    B4109 : 年降水量（mm）
    B4106 : 降水日数（年間）
    A1303/A1101 × 100 : 高齢化率（65歳以上人口比率、%）— 熱中症高リスク群
    L322106/L3221 × 100 : 保健医療費割合（%）— 医療負担指標
    I510120/A1101 × 10000 : 病院密度（人口1万人あたり病院数）

  Step1. 散布図：最高気温 × 保健医療費割合（47都道府県）
  Step2. 散布図：高齢化率 × 保健医療費割合（47都道府県）
  Step3. 時系列：2012-2023年 全国平均気温トレンド（B4101）
  Step4. 相関ヒートマップ：気温・高齢化・医療変数の相関行列

【データサイエンス学習ポイント】
  1. 気候データと健康指標の相関分析
  2. 都道府県別パネルデータの活用
  3. 高齢化と熱中症リスクの関係
  4. 重回帰分析による要因の特定

【データ出典】
  SSDSE-B-2026.csv: 社会・人口統計体系（都道府県データ）
  統計センターより公表の実データ（合成データ・乱数生成は一切使用しない）
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2023_H5_3_shorei.py
# ============================================================


import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

import os
FIG_DIR = os.path.normpath('html/figures') + os.sep
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ 実データ読み込み（SSDSE-B-2026: 都道府県データ）
# ================================================================
# header=0 で変数コード行を列名として読み込む
df_raw = pd.read_csv(DATA_B, encoding='cp932', header=0)

# 最初の行（日本語ラベル）を除外し、都道府県行のみ抽出（R+5桁数字）
df_all = df_raw.iloc[1:].copy()
df_all = df_all[df_all['Code'].str.match(r'^R\d{5}$', na=False)].copy()

# 必要列を数値変換
num_cols = ['SSDSE-B-2026',
            'A1101',  # 総人口
            'A1303',  # 65歳以上人口
            'B4101',  # 年平均気温
            'B4102',  # 最高気温
            'B4103',  # 最低気温
            'B4106',  # 降水日数
            'B4109',  # 年降水量
            'L3221',  # 消費支出（二人以上の世帯）
            'L322106',# 保健医療費（二人以上の世帯）
            'I510120', # 一般病院数
            ]
for c in num_cols:
    if c in df_all.columns:
        df_all[c] = pd.to_numeric(df_all[c], errors='coerce')

# ================================================================
# ■ 2022年度 断面データ（47都道府県）
# ================================================================
df_2022 = df_all[df_all['SSDSE-B-2026'] == 2022].copy().reset_index(drop=True)

print("=" * 60)
print(f"■ 2022年度 都道府県数: N={len(df_2022)}")
print("=" * 60)

# 派生変数の計算
pop    = df_2022['A1101'].clip(lower=1)   # 総人口
elder  = df_2022['A1303']                 # 65歳以上人口
hospit = df_2022['I510120']               # 一般病院数
spend  = df_2022['L3221']                 # 消費支出
health = df_2022['L322106']               # 保健医療費

# 高齢化率（%）
aging_rate = (elder / pop * 100).values

# 保健医療費割合（%）: 消費支出に占める保健医療費の割合
health_ratio = (health / spend * 100).values

# 病院密度（人口1万人あたり病院数）
hosp_density = (hospit / pop * 10000).values

# 気温変数
temp_avg  = df_2022['B4101'].values    # 年平均気温
temp_max  = df_2022['B4102'].values    # 最高気温
temp_min  = df_2022['B4103'].values    # 最低気温
rain_days = df_2022['B4106'].values    # 降水日数
rain_vol  = df_2022['B4109'].values    # 年降水量

# 都道府県名（Prefecture列に英語、日本語ラベルはRow0から）
pref_names = df_2022['Prefecture'].values

# 有効行（欠損除外）
valid_mask = (
    np.isfinite(temp_max) & np.isfinite(health_ratio) &
    np.isfinite(aging_rate) & np.isfinite(hosp_density) &
    np.isfinite(temp_avg) & np.isfinite(temp_min) &
    np.isfinite(rain_days) & np.isfinite(rain_vol)
)
N = valid_mask.sum()
print(f"有効都道府県数（欠損除外後）: N={N}")

# フィルタ済みデータ
_tm  = temp_max[valid_mask]
_ta  = temp_avg[valid_mask]
_tn  = temp_min[valid_mask]
_rd  = rain_days[valid_mask]
_rv  = rain_vol[valid_mask]
_ar  = aging_rate[valid_mask]
_hr  = health_ratio[valid_mask]
_hd  = hosp_density[valid_mask]
_pn  = pref_names[valid_mask]

# 相関分析結果の出力
print("\n" + "=" * 60)
print("■ 相関分析（2022年度 47都道府県）")
print("=" * 60)
print(f"\n  {'変数ペア':<35} {'r':>8} {'p値':>10} {'有意':>6}")
print("  " + "-" * 63)

pairs = [
    ('最高気温 × 保健医療費割合', _tm, _hr),
    ('年平均気温 × 保健医療費割合', _ta, _hr),
    ('高齢化率 × 保健医療費割合', _ar, _hr),
    ('最高気温 × 高齢化率', _tm, _ar),
    ('降水量 × 最高気温', _rv, _tm),
    ('降水日数 × 最高気温', _rd, _tm),
    ('病院密度 × 保健医療費割合', _hd, _hr),
]

for label, x, y in pairs:
    r, p = stats.pearsonr(x, y)
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    print(f"  {label:<35} {r:>8.4f} {p:>10.4f} {sig:>6}")

# 重回帰分析（保健医療費割合 を最高気温・高齢化率・降水量で説明）
print("\n" + "=" * 60)
print("■ 重回帰分析: 保健医療費割合 ~ 最高気温 + 高齢化率 + 降水量")
print("=" * 60)

X_reg = sm.add_constant(np.column_stack([_tm, _ar, _rv]))
model = sm.OLS(_hr, X_reg).fit()
print(model.summary2())

# ================================================================
# ■ 時系列データ（2012-2023 全国平均）
# ================================================================
years_all = sorted(df_all['SSDSE-B-2026'].dropna().unique().astype(int))

ts_temp_avg = []
ts_temp_max = []

for yr in years_all:
    df_yr = df_all[df_all['SSDSE-B-2026'] == yr]
    ta = pd.to_numeric(df_yr['B4101'], errors='coerce').mean()
    tm = pd.to_numeric(df_yr['B4102'], errors='coerce').mean()
    ts_temp_avg.append(ta)
    ts_temp_max.append(tm)

ts_temp_avg = np.array(ts_temp_avg)
ts_temp_max = np.array(ts_temp_max)

print("\n" + "=" * 60)
print("■ 気温トレンド（全国47都道府県平均）")
print("=" * 60)
for yr, ta, tm in zip(years_all, ts_temp_avg, ts_temp_max):
    print(f"  {yr}年: 年平均気温={ta:.2f}℃, 最高気温={tm:.2f}℃")

# トレンド検定（Mann-Kendall 代わりにスピアマン相関）
rho_ta, p_ta = stats.spearmanr(years_all, ts_temp_avg)
rho_tm, p_tm = stats.spearmanr(years_all, ts_temp_max)
print(f"\n  年平均気温トレンド: ρ={rho_ta:.4f}, p={p_ta:.4f}")
print(f"  最高気温トレンド  : ρ={rho_tm:.4f}, p={p_tm:.4f}")

# ================================================================
# ■ 図1: 散布図 — 最高気温 × 保健医療費割合（47都道府県）
# ================================================================
fig1, ax1 = plt.subplots(figsize=(9, 6))

ax1.scatter(_tm, _hr, color='#E65100', s=55, alpha=0.75, edgecolors='white',
            linewidths=0.7, zorder=3, label='都道府県')

# 回帰直線
z1 = np.polyfit(_tm, _hr, 1)
xs1 = np.linspace(_tm.min(), _tm.max(), 200)
ax1.plot(xs1, np.poly1d(z1)(xs1), color='#C62828', linewidth=2.0,
         linestyle='--', alpha=0.85, label='回帰直線')

# 相関係数
r1, p1 = stats.pearsonr(_tm, _hr)
sig1 = '***' if p1 < 0.001 else '**' if p1 < 0.01 else '*' if p1 < 0.05 else 'n.s.'

# 注目都道府県をラベル表示（上位・下位）
combined1 = list(zip(_tm, _hr, _pn))
top3_heat  = sorted(combined1, key=lambda x: x[0], reverse=True)[:3]
top3_health = sorted(combined1, key=lambda x: x[1], reverse=True)[:3]
label_set1 = {p for _, _, p in top3_heat + top3_health}
for tx, ty, pn in combined1:
    if pn in label_set1:
        ax1.annotate(pn, (tx, ty), fontsize=8.5, xytext=(5, 4),
                     textcoords='offset points', color='#333')

ax1.set_xlabel('最高気温（日最高気温の月平均の最高値、℃）', fontsize=12)
ax1.set_ylabel('保健医療費割合（%）', fontsize=12)
ax1.set_title(f'最高気温と保健医療費割合の関係\n'
              f'r = {r1:.3f} ({sig1}), N = {N}都道府県（2022年度）',
              fontsize=13, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(True, alpha=0.3)

# 解釈メモ
ax1.text(0.02, 0.98,
         '気温が高い地域では\n健康関連支出も高い傾向',
         transform=ax1.transAxes, fontsize=9,
         verticalalignment='top', color='#555',
         bbox=dict(boxstyle='round,pad=0.4', facecolor='#FFF3E0', alpha=0.85))

plt.tight_layout()
fig1.savefig(FIG_DIR + '2023_H5_3_fig1_temp_health.png', bbox_inches='tight', dpi=150)
plt.close(fig1)
print("\n図1保存: 2023_H5_3_fig1_temp_health.png")

# ================================================================
# ■ 図2: 散布図 — 高齢化率 × 保健医療費割合（47都道府県）
# ================================================================
fig2, ax2 = plt.subplots(figsize=(9, 6))

ax2.scatter(_ar, _hr, color='#1565C0', s=55, alpha=0.75, edgecolors='white',
            linewidths=0.7, zorder=3, label='都道府県')

z2 = np.polyfit(_ar, _hr, 1)
xs2 = np.linspace(_ar.min(), _ar.max(), 200)
ax2.plot(xs2, np.poly1d(z2)(xs2), color='#0D47A1', linewidth=2.0,
         linestyle='--', alpha=0.85, label='回帰直線')

r2, p2 = stats.pearsonr(_ar, _hr)
sig2 = '***' if p2 < 0.001 else '**' if p2 < 0.01 else '*' if p2 < 0.05 else 'n.s.'

combined2 = list(zip(_ar, _hr, _pn))
top3_aging = sorted(combined2, key=lambda x: x[0], reverse=True)[:3]
top3_health2 = sorted(combined2, key=lambda x: x[1], reverse=True)[:3]
label_set2 = {p for _, _, p in top3_aging + top3_health2}
for ax, ay, pn in combined2:
    if pn in label_set2:
        ax2.annotate(pn, (ax, ay), fontsize=8.5, xytext=(5, 4),
                     textcoords='offset points', color='#333')

ax2.set_xlabel('高齢化率（65歳以上人口比率、%）', fontsize=12)
ax2.set_ylabel('保健医療費割合（%）', fontsize=12)
ax2.set_title(f'高齢化率と保健医療費割合の関係\n'
              f'r = {r2:.3f} ({sig2}), N = {N}都道府県（2022年度）',
              fontsize=13, fontweight='bold')
ax2.legend(fontsize=10)
ax2.grid(True, alpha=0.3)

ax2.text(0.02, 0.98,
         '高齢化率が高い地域ほど\n保健医療費割合が増大',
         transform=ax2.transAxes, fontsize=9,
         verticalalignment='top', color='#555',
         bbox=dict(boxstyle='round,pad=0.4', facecolor='#E3F2FD', alpha=0.85))

plt.tight_layout()
fig2.savefig(FIG_DIR + '2023_H5_3_fig2_aging_health.png', bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2023_H5_3_fig2_aging_health.png")

# ================================================================
# ■ 図3: 時系列 — 2012-2023 全国平均気温トレンド（B4101）
# ================================================================
fig3, ax3 = plt.subplots(figsize=(11, 5))

# 年平均気温の推移
ax3.plot(years_all, ts_temp_avg, color='#1565C0', linewidth=2.5,
         marker='o', markersize=7, label='年平均気温', zorder=4)
ax3.fill_between(years_all, ts_temp_avg, alpha=0.15, color='#1565C0')

# トレンド直線（OLS）
z3 = np.polyfit(years_all, ts_temp_avg, 1)
xs3 = np.array(years_all)
ax3.plot(xs3, np.poly1d(z3)(xs3), color='#C62828', linewidth=1.8,
         linestyle='--', alpha=0.8, label=f'トレンド (傾き={z3[0]:.3f}℃/年)')

# 最高気温の副軸
ax3b = ax3.twinx()
ax3b.plot(years_all, ts_temp_max, color='#E65100', linewidth=2.0,
          marker='s', markersize=6, linestyle='-.', label='最高気温', zorder=3, alpha=0.85)
ax3b.set_ylabel('最高気温（℃）', fontsize=11, color='#E65100')
ax3b.tick_params(axis='y', labelcolor='#E65100')

ax3.set_xlabel('年度', fontsize=12)
ax3.set_ylabel('年平均気温（℃）', fontsize=12)
ax3.set_title('全国平均気温の推移（2012-2023年度）\n'
              '47都道府県平均 — SSDSE-B-2026 実データ',
              fontsize=13, fontweight='bold')
ax3.set_xticks(years_all)
ax3.set_xticklabels([str(y) for y in years_all], rotation=30, fontsize=9)
ax3.grid(True, alpha=0.3)

# 凡例をまとめる
lines1, labels1 = ax3.get_legend_handles_labels()
lines2, labels2 = ax3b.get_legend_handles_labels()
ax3.legend(lines1 + lines2, labels1 + labels2, fontsize=10, loc='upper left')

# スピアマン相関のメモ
ax3.text(0.98, 0.05,
         f'年平均気温トレンド\nρ={rho_ta:.3f}, p={p_ta:.3f}',
         transform=ax3.transAxes, fontsize=9,
         horizontalalignment='right', color='#333',
         bbox=dict(boxstyle='round,pad=0.4', facecolor='#E8F5E9', alpha=0.85))

plt.tight_layout()
fig3.savefig(FIG_DIR + '2023_H5_3_fig3_timeseries.png', bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2023_H5_3_fig3_timeseries.png")

# ================================================================
# ■ 図4: 相関ヒートマップ（気温・高齢化・医療変数）
# ================================================================
var_labels = ['年平均気温', '最高気温', '最低気温', '降水日数', '降水量',
              '高齢化率', '保健医療費割合', '病院密度']

X_mat = np.column_stack([_ta, _tm, _tn, _rd, _rv, _ar, _hr, _hd])

# 欠損のある列を除外
valid_rows = np.all(np.isfinite(X_mat), axis=1)
X_mat_clean = X_mat[valid_rows]
n_vars = X_mat_clean.shape[1]

corr_mat = np.zeros((n_vars, n_vars))
pval_mat = np.zeros((n_vars, n_vars))
for i in range(n_vars):
    for j in range(n_vars):
        r_ij, p_ij = stats.pearsonr(X_mat_clean[:, i], X_mat_clean[:, j])
        corr_mat[i, j] = r_ij
        pval_mat[i, j] = p_ij

fig4, ax4 = plt.subplots(figsize=(10, 8))

# カラーマップ
im = ax4.imshow(corr_mat, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax4, label='ピアソン相関係数', fraction=0.046, pad=0.04)

ax4.set_xticks(range(n_vars))
ax4.set_yticks(range(n_vars))
ax4.set_xticklabels(var_labels, rotation=30, ha='right', fontsize=10)
ax4.set_yticklabels(var_labels, fontsize=10)

# セルに値を表示
for i in range(n_vars):
    for j in range(n_vars):
        r_val = corr_mat[i, j]
        p_val = pval_mat[i, j]
        sig_star = '***' if p_val < 0.001 else '**' if p_val < 0.01 else '*' if p_val < 0.05 else ''
        text_color = 'white' if abs(r_val) > 0.6 else 'black'
        ax4.text(j, i, f'{r_val:.2f}{sig_star}',
                 ha='center', va='center',
                 fontsize=8.5, color=text_color, fontweight='bold' if sig_star else 'normal')

ax4.set_title('相関ヒートマップ：気温・高齢化・医療変数（2022年度 47都道府県）\n'
              '* p<0.05  ** p<0.01  *** p<0.001',
              fontsize=12, fontweight='bold', pad=15)

plt.tight_layout()
fig4.savefig(FIG_DIR + '2023_H5_3_fig4_heatmap.png', bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2023_H5_3_fig4_heatmap.png")

print("\n" + "=" * 60)
print("■ 全図の生成完了（4枚）")
print("=" * 60)
print("  fig1_temp_health.png  : 最高気温 × 保健医療費割合 散布図")
print("  fig2_aging_health.png : 高齢化率 × 保健医療費割合 散布図")
print("  fig3_timeseries.png   : 2012-2023年 全国平均気温トレンド")
print("  fig4_heatmap.png      : 相関ヒートマップ（7変数）")
print()
print("  データ：SSDSE-B-2026.csv 実データのみ使用（np.random 一切不使用）")