"""
教育用再現コード: 2024年 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：学力と外見への投資に関する回帰モデルを用いた分析
著者：倉本佳詩野（宮崎県立五ヶ瀬中等教育学校）

【分析概要】
  データ：SSDSE-B-2026.csv（2022年度、47都道府県）
  目的変数：大学進学率（高校卒業者のうち進学者数 / 高校卒業者数 × 100）
            ※ 論文の「学力」変数の代理指標として使用
  説明変数候補：
    消費支出       ：経済的余裕（収入の代理）
    被服費割合     ：被服及び履物費 / 消費支出 × 100（外見投資の代理）
    被服費         ：被服及び履物費（外見投資の絶対額）
    教育費割合     ：教育費 / 消費支出 × 100
    教養娯楽費     ：文化・教育的環境の代理
    合計特殊出生率 ：家族構造
    降水日数       ：気候（屋外活動への影響）
    高齢化率       ：65歳以上人口 / 総人口 × 100

  Step1. 全変数とのPearson相関係数
  Step2. ステップワイズ変数選択（バックワードAIC基準）
  Step3. 最終モデルの標準化回帰係数
  Step4. 消費支出 vs 大学進学率の散布図

【主要な発見】
  ステップワイズによりAIC最小モデルを選択
  被服費割合は非有意（仮説棄却）
  消費支出（経済的余裕）が学力代理指標（大学進学率）と正相関

【データサイエンス学習ポイント】
  1. ステップワイズ変数選択（バックワード）とAIC
  2. 標準化回帰係数による変数重要度比較
  3. RMSE算出
  4. 実公的データ（SSDSE）を用いた回帰分析の実践
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2024_H5_3_shorei.py
# ============================================================


import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import warnings
import os

warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_PATH = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ データ読み込み（SSDSE-B-2026、2022年度 47都道府県）
# ================================================================
df_b = pd.read_csv(DATA_PATH, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}$', na=False)]
df = df_b[df_b['年度'] == 2022].copy().reset_index(drop=True)

# ── 変数構築 ───────────────────────────────────────────────────
# 目的変数：大学進学率（学力の代理指標）
df['大学進学率'] = (
    df['高等学校卒業者のうち進学者数'] / df['高等学校卒業者数'] * 100
)

# 説明変数
df['消費支出']       = df['消費支出（二人以上の世帯）']             # 経済的余裕
df['被服費割合']     = df['被服及び履物費（二人以上の世帯）'] / df['消費支出（二人以上の世帯）'] * 100  # 外見投資比率
df['被服費']         = df['被服及び履物費（二人以上の世帯）']       # 外見投資絶対額
df['教育費割合']     = df['教育費（二人以上の世帯）'] / df['消費支出（二人以上の世帯）'] * 100
df['教養娯楽費']     = df['教養娯楽費（二人以上の世帯）']
df['合計特殊出生率'] = df['合計特殊出生率']
df['降水日数']       = df['降水日数（年間）']
df['高齢化率']       = df['65歳以上人口'] / df['総人口'] * 100

VAR_NAMES = ['消費支出', '被服費割合', '被服費', '教育費割合',
             '教養娯楽費', '合計特殊出生率', '降水日数', '高齢化率']

OUTCOME = '大学進学率'

X_df = df[VAR_NAMES].copy()
y    = df[OUTCOME].copy()

print("=" * 60)
print(f"■ データ概要: N={len(df)}都道府県（SSDSE-B-2026、2022年度）")
print("=" * 60)
print(df[[OUTCOME] + VAR_NAMES].describe().round(2))

# ================================================================
# ■ Step1. 全変数とのPearson相関
# ================================================================
print("\n" + "=" * 60)
print(f"■ Step1. 全変数とのPearson相関（目的変数：{OUTCOME}）")
print("=" * 60)
print(f"\n  {'変数':<16} {'r':>8} {'p値':>10} {'有意':>6}")
print("  " + "-" * 44)

corrs_all = []
pvals_all = []
for name in VAR_NAMES:
    r, p = stats.pearsonr(X_df[name], y)
    corrs_all.append(r)
    pvals_all.append(p)
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    print(f"  {name:<16} {r:>8.4f} {p:>10.4f} {sig:>6}")

# ================================================================
# ■ Step2. ステップワイズ（バックワード AIC基準）
# ================================================================
print("\n" + "=" * 60)
print("■ Step2. ステップワイズ変数選択（AIC基準・バックワード）")
print("=" * 60)


def backward_aic(X, y):
    """バックワード法（AIC基準）: statsmodels OLS を使用"""
    cols = list(X.columns)
    aic_history = []
    step_labels = []

    # 全変数モデルのAIC
    model = sm.OLS(y, sm.add_constant(X[cols])).fit()
    aic_history.append(model.aic)
    step_labels.append(f'全変数\n({len(cols)}変数)')

    while len(cols) > 1:
        # 各変数を1つ除いたときのAICを計算
        aics = {}
        for c in cols:
            sub = [x for x in cols if x != c]
            m = sm.OLS(y, sm.add_constant(X[sub])).fit()
            aics[c] = m.aic

        # 除いたとき最もAICが改善する変数
        worst = min(aics, key=lambda k: aics[k])

        if aics[worst] < model.aic:
            cols.remove(worst)
            model = sm.OLS(y, sm.add_constant(X[cols])).fit()
            aic_history.append(model.aic)
            step_labels.append(f'-{worst}\n({len(cols)}変数)')
            print(f"  除外: {worst:<16}（AIC: {aic_history[-2]:.2f} → {aic_history[-1]:.2f}）")
        else:
            break

    return cols, aic_history, step_labels, model


selected_cols, aic_hist, step_lbls, final_model = backward_aic(X_df, y)

rmse = float(np.sqrt(np.mean((y - final_model.fittedvalues) ** 2)))

print(f"\n  最終選択変数: {selected_cols}")
print(f"  R²   = {final_model.rsquared:.4f}")
print(f"  RMSE = {rmse:.4f}")

# ================================================================
# ■ 標準化回帰係数
# ================================================================
scaler = StandardScaler()
X_sel_std = scaler.fit_transform(X_df[selected_cols])
y_std = (y - y.mean()) / y.std()
model_std = sm.OLS(y_std, sm.add_constant(X_sel_std)).fit()
std_coefs = model_std.params[1:]
std_pvals = model_std.pvalues[1:]
std_cis   = model_std.conf_int().iloc[1:]   # 95% CI for standardized coefs

print("\n  標準化回帰係数（最終モデル）:")
for name, coef, pv in zip(selected_cols, std_coefs, std_pvals):
    sig = '***' if pv < 0.001 else '**' if pv < 0.01 else '*' if pv < 0.05 else 'n.s.'
    print(f"    {name:<16}: β={coef:.4f} ({sig})")

# ================================================================
# ■ 図の生成（4枚）
# ================================================================

# ────────────────────────────────────────────────────────────────
# 図1: 全変数とのPearson相関係数棒グラフ
# ────────────────────────────────────────────────────────────────
fig1, ax1 = plt.subplots(figsize=(9, 5.5))

y_pos1 = np.arange(len(VAR_NAMES))
cols1 = ['#C62828' if p < 0.05 else '#9E9E9E' for p in pvals_all]
ax1.barh(y_pos1, corrs_all, color=cols1, alpha=0.78,
         edgecolor='white', height=0.6)
ax1.axvline(0, color='gray', linestyle='--', linewidth=1.0)
ax1.set_yticks(y_pos1)
ax1.set_yticklabels(VAR_NAMES, fontsize=10)
ax1.set_xlabel(f'ピアソン相関係数（目的変数：{OUTCOME}）', fontsize=11)
ax1.set_title(f'各変数と{OUTCOME}の相関係数\n（赤=p<0.05 有意、灰=非有意）',
              fontsize=12, fontweight='bold')
ax1.invert_yaxis()
ax1.grid(axis='x', alpha=0.3)

for i, (r, p) in enumerate(zip(corrs_all, pvals_all)):
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
    offset = 0.02 if r >= 0 else -0.02
    ha = 'left' if r >= 0 else 'right'
    ax1.text(r + offset, i, f'{r:.3f}{sig}', va='center', ha=ha, fontsize=8.5)

# 被服費割合に注記（外見投資は非有意）
idx_clothe = VAR_NAMES.index('被服費割合')
ax1.annotate('被服費割合は非有意\n（仮説棄却）',
             xy=(corrs_all[idx_clothe], idx_clothe),
             xytext=(corrs_all[idx_clothe] + 0.15, idx_clothe + 1.5),
             fontsize=8, color='#555',
             arrowprops=dict(arrowstyle='->', color='#888', lw=0.8))

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2024_H5_3_fig1_corr.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig1)
print("\n図1保存: 2024_H5_3_fig1_corr.png")

# ────────────────────────────────────────────────────────────────
# 図2: ステップワイズのAIC推移
# ────────────────────────────────────────────────────────────────
fig2, ax2 = plt.subplots(figsize=(9, 5))

x_pos2 = np.arange(len(aic_hist))
ax2.plot(x_pos2, aic_hist, 'o-', color='#1565C0', linewidth=2, markersize=8)
ax2.fill_between(x_pos2, min(aic_hist) - 5, aic_hist,
                 alpha=0.12, color='#1565C0')

best_step = int(np.argmin(aic_hist))
ax2.scatter([best_step], [aic_hist[best_step]],
            color='#C62828', s=150, zorder=5,
            label=f'最小AIC={aic_hist[best_step]:.2f}\n({step_lbls[best_step].split(chr(10))[0]})')

ax2.set_xticks(x_pos2)
ax2.set_xticklabels(step_lbls, fontsize=8.5)
ax2.set_ylabel('AIC', fontsize=11)
ax2.set_title('ステップワイズ変数選択（バックワード）のAIC推移\n'
              '変数を除くごとにAICが下がれば採用',
              fontsize=12, fontweight='bold')
ax2.legend(fontsize=9)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2024_H5_3_fig2_stepwise.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2024_H5_3_fig2_stepwise.png")

# ────────────────────────────────────────────────────────────────
# 図3: 最終モデルの標準化回帰係数（95% CI付き）
# ────────────────────────────────────────────────────────────────
fig3, ax3 = plt.subplots(figsize=(8, 5))

y_pos3 = np.arange(len(selected_cols))
cols3 = ['#C62828' if c > 0 else '#1565C0' for c in std_coefs]
ax3.barh(y_pos3, std_coefs, color=cols3, alpha=0.78,
         edgecolor='white', height=0.55)

# 95% CI のエラーバー
xerr_lo = std_coefs.values - std_cis.iloc[:, 0].values
xerr_hi = std_cis.iloc[:, 1].values - std_coefs.values
ax3.errorbar(std_coefs.values, y_pos3,
             xerr=[xerr_lo, xerr_hi],
             fmt='none', color='#333', linewidth=1.2, capsize=4)

ax3.axvline(0, color='gray', linestyle='--', linewidth=1.0)
ax3.set_yticks(y_pos3)
ax3.set_yticklabels(selected_cols, fontsize=10)
ax3.set_xlabel('標準化回帰係数 β', fontsize=11)
ax3.set_title(
    f'最終モデルの標準化回帰係数（95% CI）\n'
    f'R²={final_model.rsquared:.3f}、RMSE={rmse:.2f}%',
    fontsize=12, fontweight='bold'
)
ax3.invert_yaxis()
ax3.grid(axis='x', alpha=0.3)

for i, (coef, pv) in enumerate(zip(std_coefs, std_pvals)):
    sig = '***' if pv < 0.001 else '**' if pv < 0.01 else '*' if pv < 0.05 else 'n.s.'
    offset = 0.01 if coef >= 0 else -0.01
    ha = 'left' if coef >= 0 else 'right'
    ax3.text(coef + offset, i, f'β={coef:.3f} {sig}',
             va='center', ha=ha, fontsize=8.5)

from matplotlib.patches import Patch
ax3.legend(handles=[Patch(color='#C62828', alpha=0.78, label='正の効果'),
                    Patch(color='#1565C0', alpha=0.78, label='負の効果')],
           fontsize=9, loc='lower right')
plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2024_H5_3_fig3_coef.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2024_H5_3_fig3_coef.png")

# ────────────────────────────────────────────────────────────────
# 図4: 消費支出 vs 大学進学率の散布図
# ────────────────────────────────────────────────────────────────
fig4, ax4 = plt.subplots(figsize=(8, 6))

x4 = df['消費支出'].values
y4 = y.values
pref_names = df['都道府県'].values

ax4.scatter(x4, y4, color='#6A1B9A', s=60, alpha=0.7, zorder=3,
            label='各都道府県')

# 回帰直線
z4 = np.polyfit(x4, y4, 1)
xs4 = np.linspace(x4.min() - 2000, x4.max() + 2000, 200)
ax4.plot(xs4, np.poly1d(z4)(xs4), color='#C62828', linewidth=2,
         alpha=0.8, label=f'回帰直線: β={z4[0]*10000:.3f}（万円単位換算）')

r4, p4 = stats.pearsonr(x4, y4)
ax4.set_xlabel('消費支出（円/月、二人以上世帯）', fontsize=11)
ax4.set_ylabel(f'{OUTCOME}（%）', fontsize=11)
ax4.set_title(f'消費支出と{OUTCOME}の関係\nr={r4:.2f}, p={p4:.4f}',
              fontsize=12, fontweight='bold')

# 注記テキスト
ax4.text(0.05, 0.92,
         '消費支出（経済的余裕）は\n大学進学率と正相関\n（経済力 → 教育投資）',
         transform=ax4.transAxes, fontsize=9, color='#444',
         bbox=dict(facecolor='#E8F5E9', edgecolor='#2E7D32',
                   boxstyle='round,pad=0.5'))

# 外れ値ラベル（上位・下位各2件）
combined = list(zip(y4, x4, pref_names))
top2    = sorted(combined, key=lambda t: t[0], reverse=True)[:2]
bottom2 = sorted(combined, key=lambda t: t[0])[:2]
for yv, xv, pname in top2 + bottom2:
    ax4.annotate(pname, xy=(xv, yv), xytext=(6, 0),
                 textcoords='offset points', fontsize=7.5,
                 color='#333')

ax4.legend(fontsize=9)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2024_H5_3_fig4_scatter.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2024_H5_3_fig4_scatter.png")

print("\n全図の生成完了（4枚）")
print(f"\n■ 最終モデルサマリ")
print(f"  選択変数: {selected_cols}")
print(f"  R²={final_model.rsquared:.4f}, RMSE={rmse:.4f}%")