"""
教育用再現コード: 2024年 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：都市部とへき地の生徒間の英語能力の格差を是正するためには
著者：井上咲春（名古屋大学教育学部附属高等学校）

【分析概要】
  データ：SSDSE-B-2026.csv（2022年度）
  目的変数：大学進学率（高等学校卒業者のうち進学者数 / 高等学校卒業者数 × 100）
            ※英語正答率の公的都道府県別データが未公開のため、学力の代理指標として使用
  説明変数：1人当たり県民所得（SSDSE-E）, 教育費（二人以上の世帯）,
            保育所充実度（保育所等数/総人口×1000）, 年平均気温, 高齢化率

  Step1. 単回帰: 大学進学率 ~ log(1人当たり県民所得)
  Step2. 重回帰: 多変数モデル
  Step3. Pearson相関表
  Step4. 地域別棒グラフ（8地域区分）

【データサイエンス学習ポイント】
  1. 相関係数とp値の解釈
  2. 単回帰分析と決定係数（R²）
  3. 対数変換による非線形関係の線形化
  4. 「統計的有意」と「実質的意義」の違い
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#     ・SSDSE-E-2026.csv
#       → data/raw/SSDSE-E-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#     （SSDSE-E（社会・人口統計体系 都道府県の指標2） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2024_H5_2_shorei.py
# ============================================================


import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

import os
FIG_DIR = 'html/figures'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ データ読み込み: SSDSE-B 2022
# ================================================================
df_b = pd.read_csv(
    'data/raw/SSDSE-B-2026.csv',
    encoding='cp932', header=1
)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}$', na=False)]
df_2022 = df_b[df_b['年度'] == 2022].copy().reset_index(drop=True)

for col in ['総人口', '65歳以上人口',
            '高等学校卒業者数', '高等学校卒業者のうち進学者数',
            '教育費（二人以上の世帯）', '保育所等数',
            '年平均気温', 'ごみのリサイクル率', '婚姻件数']:
    df_2022[col] = pd.to_numeric(df_2022[col], errors='coerce')

# ================================================================
# ■ データ読み込み: SSDSE-E（1人当たり県民所得）
# ================================================================
df_e_raw = pd.read_csv(
    'data/raw/SSDSE-E-2026.csv',
    encoding='cp932', header=0
)
df_e = df_e_raw.iloc[1:].copy(); df_e.columns = df_e_raw.iloc[0].values
df_e2 = df_e.iloc[1:].copy(); df_e2.columns = df_e.iloc[0].values
df_e2 = df_e2[df_e2['都道府県'] != '全国'].reset_index(drop=True)
df_e2['1人当たり県民所得（平成27年基準）'] = pd.to_numeric(
    df_e2['1人当たり県民所得（平成27年基準）'], errors='coerce')

# ================================================================
# ■ 変数構築
# ================================================================
# SSDSE-B列名の都道府県を揃えてマージ
df_2022['都道府県_key'] = df_2022['都道府県'].str.replace('県$|府$|都$|道$', '', regex=True)
df_e2['都道府県_key'] = df_e2['都道府県'].str.replace('県$|府$|都$|道$', '', regex=True)
df = df_2022.merge(
    df_e2[['都道府県_key', '1人当たり県民所得（平成27年基準）']],
    on='都道府県_key', how='left'
)

# 目的変数: 大学進学率 (%)
df['大学進学率'] = df['高等学校卒業者のうち進学者数'] / df['高等学校卒業者数'] * 100

# 説明変数
df['高齢化率'] = df['65歳以上人口'] / df['総人口'] * 100
df['保育所充実度'] = df['保育所等数'] / df['総人口'] * 1000
df['ln所得'] = np.log(df['1人当たり県民所得（平成27年基準）'])
df['ln人口密度'] = np.log(
    df['総人口'] / (df_2022['総人口'] / df_2022['総人口'])  # dummy placeholder
)

# 都道府県名（表示用）
PREFS = df['都道府県'].tolist()

# 有効行のみ抽出
cols_needed = ['大学進学率', 'ln所得', '教育費（二人以上の世帯）',
               '保育所充実度', '年平均気温', '高齢化率',
               '1人当たり県民所得（平成27年基準）']
df_ana = df[cols_needed + ['都道府県']].dropna().reset_index(drop=True)
N = len(df_ana)

# 配列として取り出す
y = df_ana['大学進学率'].values
ln_income = df_ana['ln所得'].values
edu_exp = df_ana['教育費（二人以上の世帯）'].values
childcare = df_ana['保育所充実度'].values
temperature = df_ana['年平均気温'].values
aging = df_ana['高齢化率'].values
income_raw = df_ana['1人当たり県民所得（平成27年基準）'].values
prefs = df_ana['都道府県'].tolist()

print("=" * 60)
print(f"■ データ概要: N={N}都道府県（SSDSE-B 2022 / SSDSE-E）")
print("=" * 60)
print(df_ana[['大学進学率', '1人当たり県民所得（平成27年基準）',
              '教育費（二人以上の世帯）', '保育所充実度',
              '年平均気温', '高齢化率']].describe().round(2))

# ================================================================
# ■ Step1. 単回帰: 大学進学率 ~ ln(1人当たり県民所得)
# ================================================================
print("\n" + "=" * 60)
print("■ Step1. 単回帰: 大学進学率 ~ ln(1人当たり県民所得)")
print("=" * 60)
X1 = sm.add_constant(ln_income)
model_simple = sm.OLS(y, X1).fit()
print(model_simple.summary2())

# ================================================================
# ■ Step2. 重回帰
# ================================================================
print("\n" + "=" * 60)
print("■ Step2. 重回帰（多変数モデル）")
print("=" * 60)
X2 = sm.add_constant(
    np.column_stack([ln_income, edu_exp, childcare, temperature, aging])
)
model_multi = sm.OLS(y, X2).fit()
print(model_multi.summary2())

# ================================================================
# ■ Step3. Pearson相関表
# ================================================================
print("\n" + "=" * 60)
print("■ Step3. Pearson相関（大学進学率との相関）")
print("=" * 60)
var_list = {
    'ln(1人当たり県民所得)': ln_income,
    '教育費': edu_exp,
    '保育所充実度': childcare,
    '年平均気温': temperature,
    '高齢化率': aging,
}
print(f"\n  {'変数':<22} {'r':>8} {'p値':>10} {'有意':>6}")
print("  " + "-" * 50)
for name, x in var_list.items():
    r, p = stats.pearsonr(x, y)
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    print(f"  {name:<22} {r:>8.4f} {p:>10.4f} {sig:>6}")

# ================================================================
# ■ 8地域区分
# ================================================================
REGION_MAP = {
    '北海道': '北海道', '青森県': '東北', '岩手県': '東北', '宮城県': '東北',
    '秋田県': '東北', '山形県': '東北', '福島県': '東北',
    '茨城県': '関東', '栃木県': '関東', '群馬県': '関東', '埼玉県': '関東',
    '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国', '島根県': '中国', '岡山県': '中国', '広島県': '中国', '山口県': '中国',
    '徳島県': '四国', '香川県': '四国', '愛媛県': '四国', '高知県': '四国',
    '福岡県': '九州', '佐賀県': '九州', '長崎県': '九州', '熊本県': '九州',
    '大分県': '九州', '宮崎県': '九州', '鹿児島県': '九州', '沖縄県': '九州',
}
df_ana['地域'] = df_ana['都道府県'].map(REGION_MAP)

# ================================================================
# ■ 図の生成（4枚）
# ================================================================

# ────────────────────────────────────────────────────────────────
# 図1: 散布図 ln(所得) vs 大学進学率 + 回帰直線
# ────────────────────────────────────────────────────────────────
fig1, ax1 = plt.subplots(figsize=(8, 6))

ax1.scatter(ln_income, y, color='#1565C0', s=55, alpha=0.7, zorder=3, label='各都道府県')

# 東京・沖縄・愛知をハイライト
highlights = {'東京都': None, '沖縄県': None, '愛知県': None, '青森県': None}
for pref, (xi, yi) in {
    p: (ln_income[i], y[i]) for i, p in enumerate(prefs) if p in highlights
}.items():
    ax1.scatter(xi, yi, color='#C62828', s=100, zorder=5)
    ax1.annotate(pref.replace('都', '').replace('府', '').replace('県', ''),
                 (xi, yi), textcoords='offset points', xytext=(5, 4),
                 fontsize=8.5, color='#C62828')

# 回帰直線
xs = np.linspace(ln_income.min() - 0.05, ln_income.max() + 0.05, 200)
slope = model_simple.params[1]
intercept = model_simple.params[0]
ax1.plot(xs, intercept + slope * xs, 'r-', linewidth=2, alpha=0.8,
         label=f'回帰直線 (R²={model_simple.rsquared:.3f})')

pred = model_simple.get_prediction(sm.add_constant(xs))
ci = pred.conf_int(alpha=0.05)
ax1.fill_between(xs, ci[:, 0], ci[:, 1], alpha=0.12, color='red', label='95%信頼区間')

r1, p1 = stats.pearsonr(ln_income, y)
ax1.set_xlabel('ln(1人当たり県民所得) [千円, 平成27年基準]', fontsize=11)
ax1.set_ylabel('大学進学率（%）', fontsize=11)
ax1.set_title(f'1人当たり県民所得（対数）と大学進学率\nr={r1:.3f}, p={p1:.4f}',
              fontsize=12, fontweight='bold')
ax1.legend(fontsize=9)
ax1.grid(True, alpha=0.3)

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2024_H5_2_fig1_scatter_income.png'), bbox_inches='tight', dpi=150)
plt.close(fig1)
print("\n図1保存: 2024_H5_2_fig1_scatter_income.png")

# ────────────────────────────────────────────────────────────────
# 図2: 散布図 教育費 vs 大学進学率
# ────────────────────────────────────────────────────────────────
fig2, ax2 = plt.subplots(figsize=(8, 6))

ax2.scatter(edu_exp, y, color='#2E7D32', s=55, alpha=0.7, zorder=3, label='各都道府県')

for pref, (xi, yi) in {
    p: (edu_exp[i], y[i]) for i, p in enumerate(prefs) if p in highlights
}.items():
    ax2.scatter(xi, yi, color='#C62828', s=80, zorder=5)
    ax2.annotate(pref.replace('都', '').replace('府', '').replace('県', ''),
                 (xi, yi), textcoords='offset points', xytext=(5, 4),
                 fontsize=8.5, color='#C62828')

z2 = np.polyfit(edu_exp, y, 1)
xs2 = np.linspace(edu_exp.min() - 1000, edu_exp.max() + 1000, 200)
ax2.plot(xs2, np.poly1d(z2)(xs2), color='#2E7D32', linewidth=2, alpha=0.8,
         label=f'回帰直線')

r2, p2 = stats.pearsonr(edu_exp, y)
ax2.set_xlabel('教育費（二人以上の世帯）[円]', fontsize=11)
ax2.set_ylabel('大学進学率（%）', fontsize=11)
ax2.set_title(f'教育費と大学進学率\nr={r2:.3f}, p={p2:.4f}',
              fontsize=12, fontweight='bold')
ax2.legend(fontsize=9)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2024_H5_2_fig2_scatter_edu.png'), bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2024_H5_2_fig2_scatter_edu.png")

# ────────────────────────────────────────────────────────────────
# 図3: 重回帰の回帰係数プロット（標準化係数）
# ────────────────────────────────────────────────────────────────
from sklearn.preprocessing import StandardScaler

fig3, ax3 = plt.subplots(figsize=(8, 5))

X2_mat = np.column_stack([ln_income, edu_exp, childcare, temperature, aging])
scaler = StandardScaler()
X2_std = scaler.fit_transform(X2_mat)
y_std = (y - y.mean()) / y.std()
model_std = sm.OLS(y_std, sm.add_constant(X2_std)).fit()

var_labels = ['ln(県民所得)', '教育費', '保育所充実度', '年平均気温', '高齢化率']
coefs = model_std.params[1:]
pvals = model_std.pvalues[1:]
ses = model_std.bse[1:]

y_pos = np.arange(len(var_labels))
cols = ['#C62828' if c > 0 else '#1565C0' for c in coefs]
ax3.barh(y_pos, coefs, color=cols, alpha=0.75, edgecolor='white', height=0.55)
ax3.errorbar(coefs, y_pos, xerr=1.96 * ses, fmt='none', color='#333', capsize=4, linewidth=1.2)
ax3.axvline(0, color='gray', linestyle='--', linewidth=1.0)
ax3.set_yticks(y_pos)
ax3.set_yticklabels(var_labels, fontsize=10)
ax3.set_xlabel('標準化回帰係数 β（±1.96SE）', fontsize=11)
ax3.set_title(f'重回帰モデルの標準化係数\nR²={model_multi.rsquared:.3f}',
              fontsize=12, fontweight='bold')
ax3.invert_yaxis()
ax3.grid(axis='x', alpha=0.3)

for i, (c, p) in enumerate(zip(coefs, pvals)):
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    offset = 0.01 if c >= 0 else -0.01
    ha = 'left' if c >= 0 else 'right'
    ax3.text(c + offset, i, sig, va='center', ha=ha, fontsize=9, color='#333')

from matplotlib.patches import Patch
ax3.legend(handles=[Patch(color='#C62828', alpha=0.75, label='正の効果'),
                    Patch(color='#1565C0', alpha=0.75, label='負の効果')], fontsize=9)
plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2024_H5_2_fig3_coef.png'), bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2024_H5_2_fig3_coef.png")

# ────────────────────────────────────────────────────────────────
# 図4: 地域別 大学進学率の棒グラフ
# ────────────────────────────────────────────────────────────────
fig4, ax4 = plt.subplots(figsize=(9, 5.5))

region_mean = df_ana.groupby('地域')['大学進学率'].mean().sort_values(ascending=False)
region_std  = df_ana.groupby('地域')['大学進学率'].std()
region_order = region_mean.index.tolist()

region_colors = {
    '関東': '#C62828', '近畿': '#E65100', '中部': '#1565C0',
    '北海道': '#2E7D32', '東北': '#6A1B9A', '中国': '#00838F',
    '四国': '#558B2F', '九州': '#4527A0',
}
bar_colors = [region_colors.get(r, '#888888') for r in region_order]

bars = ax4.bar(region_order, region_mean[region_order], color=bar_colors,
               alpha=0.78, edgecolor='white', width=0.6)
ax4.errorbar(range(len(region_order)), region_mean[region_order],
             yerr=region_std[region_order], fmt='none', color='#333', capsize=5, linewidth=1.3)

# 全国平均線
national_mean = y.mean()
ax4.axhline(national_mean, color='gray', linestyle='--', linewidth=1.5,
            label=f'全国平均 {national_mean:.1f}%')

for i, (r, v) in enumerate(zip(region_order, region_mean[region_order])):
    ax4.text(i, v + 0.5, f'{v:.1f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

ax4.set_ylabel('大学進学率（%）', fontsize=11)
ax4.set_title('地域別 大学進学率の比較（8地域区分）\n（SSDSE-B 2022, 棒：平均, エラーバー：SD）',
              fontsize=12, fontweight='bold')
ax4.legend(fontsize=9)
ax4.grid(axis='y', alpha=0.3)
ax4.tick_params(axis='x', labelsize=10)

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2024_H5_2_fig4_region.png'), bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2024_H5_2_fig4_region.png")

print("\n全図の生成完了（4枚）")