"""
2021年度 統計データ分析コンペティション 審査員奨励賞（高校生の部）
タイトル: 高齢者介護・福祉と地域格差：介護施設・老人福祉費の分析
手法: 重回帰分析、相関分析、地域比較
データ: SSDSE-B-2026.csv（都道府県別）
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2021_H5_3_shorei.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ── データ読み込み ─────────────────────────────────────────────────────────────
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

# ── 地域マップ ─────────────────────────────────────────────────────────────────
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北', '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄'
}
region_colors = {
    '北海道・東北': '#4e9af1', '関東': '#e05c5c', '中部': '#f0a500',
    '近畿': '#5cb85c', '中国・四国': '#9b59b6', '九州・沖縄': '#f39c12'
}

# ── 指標作成 ──────────────────────────────────────────────────────────────────
df_b['高齢化率'] = df_b['65歳以上人口'] / df_b['総人口'] * 100
df_b['保健医療費_人'] = df_b['保健医療費（二人以上の世帯）']          # 円/月
df_b['保育所数_万人'] = df_b['保育所等数'] / df_b['総人口'] * 10000
df_b['病院数_万人']  = df_b['一般病院数']  / df_b['総人口'] * 10000
df_b['消費支出']     = df_b['消費支出（二人以上の世帯）']
df_b['転入超過率']   = (df_b['転入者数（日本人移動者）'] - df_b['転出者数（日本人移動者）']) \
                       / df_b['総人口'] * 1000
df_b['地域'] = df_b['都道府県'].map(region_map)

# 2022年断面
df22 = df_b[df_b['年度'] == 2022].copy()

print("=" * 60)
print("2022年 断面データ（47都道府県）")
print(f"  N = {len(df22)}")
print(f"  高齢化率: 平均 {df22['高齢化率'].mean():.1f}%  "
      f"（{df22['高齢化率'].min():.1f}〜{df22['高齢化率'].max():.1f}%）")
print(f"  保健医療費: 平均 {df22['保健医療費_人'].mean():.0f}円/月  "
      f"（{df22['保健医療費_人'].min():.0f}〜{df22['保健医療費_人'].max():.0f}円/月）")
print()

# ── OLS 重回帰（標準化偏回帰係数） ────────────────────────────────────────────
y_col  = '保健医療費_人'
x_cols = ['高齢化率', '消費支出', '保育所数_万人', '病院数_万人', '転入超過率']
x_labels = {
    '高齢化率':    '高齢化率（%）',
    '消費支出':    '消費支出（円/月）',
    '保育所数_万人': '保育所数（万人当たり）',
    '病院数_万人':  '病院数（万人当たり）',
    '転入超過率':  '転入超過率（‰）',
}

d_reg = df22[[y_col] + x_cols].dropna()
print(f"回帰分析 N = {len(d_reg)}")

# 標準化
d_std = (d_reg - d_reg.mean()) / d_reg.std()
X_sm = sm.add_constant(d_std[x_cols])
result = sm.OLS(d_std[y_col], X_sm).fit()
print(result.summary())

coef_df = pd.DataFrame({
    '変数': x_cols,
    '標準化偏回帰係数': result.params[1:].values,
    '標準誤差': result.bse[1:].values,
    'p値': result.pvalues[1:].values,
})
coef_df['ラベル'] = coef_df['変数'].map(x_labels)
print()
print("標準化偏回帰係数:")
print(coef_df[['ラベル', '標準化偏回帰係数', 'p値']].to_string(index=False))
print()
print(f"R²={result.rsquared:.3f}  adj.R²={result.rsquared_adj:.3f}  "
      f"F({result.df_model:.0f},{result.df_resid:.0f})={result.fvalue:.2f}  "
      f"p={result.f_pvalue:.2e}")

# ── 相関係数（参考） ──────────────────────────────────────────────────────────
print()
print("保健医療費との相関:")
for v in x_cols:
    r, p = stats.pearsonr(df22[v].dropna(), df22[y_col][df22[v].notna()])
    print(f"  {x_labels[v]}: r={r:.3f}  p={p:.4f}")

# ── 地域別平均（高齢化率・保健医療費） ────────────────────────────────────────
region_stats = df22.groupby('地域')[['高齢化率', '保健医療費_人']].mean().reset_index()
print()
print("地域別平均:")
print(region_stats.to_string(index=False))

# ── ランキング（保健医療費/高齢者人口） ───────────────────────────────────────
df22_sorted = df22.sort_values('保健医療費_人', ascending=False)
print()
print("保健医療費 上位5都道府県:")
print(df22_sorted[['都道府県', '保健医療費_人', '高齢化率']].head(5).to_string(index=False))
print("保健医療費 下位5都道府県:")
print(df22_sorted[['都道府県', '保健医療費_人', '高齢化率']].tail(5).to_string(index=False))

# ══════════════════════════════════════════════════════════════
# 図1: 地域別高齢化率の時系列推移（2012〜2023）
# ══════════════════════════════════════════════════════════════
df_ts = df_b[df_b['年度'].between(2012, 2023)].copy()
df_ts_region = df_ts.groupby(['年度', '地域'])['高齢化率'].mean().reset_index()

region_order = ['北海道・東北', '関東', '中部', '近畿', '中国・四国', '九州・沖縄']

fig1, ax1 = plt.subplots(figsize=(9, 5))
for region in region_order:
    d = df_ts_region[df_ts_region['地域'] == region].sort_values('年度')
    ax1.plot(d['年度'], d['高齢化率'],
             color=region_colors[region], marker='o', markersize=4,
             linewidth=2, label=region)

ax1.set_title('図1: 地域別 高齢化率の推移（2012〜2023年）', fontsize=13, fontweight='bold', pad=12)
ax1.set_xlabel('年度', fontsize=11)
ax1.set_ylabel('高齢化率（%）', fontsize=11)
ax1.legend(loc='upper left', fontsize=9, framealpha=0.9)
ax1.set_xticks(range(2012, 2024))
ax1.set_xticklabels([str(y) for y in range(2012, 2024)], rotation=45, fontsize=8)
ax1.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}%'))
ax1.grid(axis='y', alpha=0.35, linestyle='--')
ax1.grid(axis='x', alpha=0.2, linestyle=':')
ax1.set_facecolor('#FAFAFA')

# 2022年の国平均を強調
y2022_nat = df_ts[df_ts['年度'] == 2022]['高齢化率'].mean()
ax1.axhline(y2022_nat, color='black', linestyle=':', alpha=0.5, linewidth=1)
ax1.text(2023.05, y2022_nat, f'全国平均\n{y2022_nat:.1f}%', fontsize=8, va='center')

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2021_H5_3_fig1.png'), bbox_inches='tight')
plt.close(fig1)
print("\n図1 保存完了")

# ══════════════════════════════════════════════════════════════
# 図2: 高齢化率 vs 保健医療費（散布図）
# ══════════════════════════════════════════════════════════════
fig2, ax2 = plt.subplots(figsize=(9, 6.5))

for _, row in df22.iterrows():
    region = row['地域']
    col = region_colors.get(region, '#888888')
    ax2.scatter(row['高齢化率'], row['保健医療費_人'],
                color=col, s=55, alpha=0.85, zorder=3,
                edgecolors='white', linewidths=0.5)
    # 都道府県ラベル（短縮）
    name = row['都道府県'].replace('県', '').replace('都', '').replace('道', '').replace('府', '')
    ax2.annotate(name,
                 xy=(row['高齢化率'], row['保健医療費_人']),
                 fontsize=6.5, ha='center', va='bottom',
                 xytext=(0, 3), textcoords='offset points', color='#333')

# 回帰直線
x_reg = df22['高齢化率'].dropna()
y_reg = df22['保健医療費_人'][x_reg.index]
slope, intercept, r_val, p_val, _ = stats.linregress(x_reg, y_reg)
x_line = np.linspace(x_reg.min(), x_reg.max(), 100)
ax2.plot(x_line, slope * x_line + intercept, 'k--', linewidth=1.5, alpha=0.7, label=f'回帰直線 (r={r_val:.3f}, p={p_val:.3f})')

# 凡例（地域）
handles = [plt.Line2D([0], [0], marker='o', color='w',
           markerfacecolor=region_colors[r], markersize=8, label=r)
           for r in region_order]
ax2.legend(handles=handles, loc='upper right', fontsize=8.5, framealpha=0.9,
           title='地域区分', title_fontsize=8)

ax2.set_title('図2: 高齢化率と保健医療費の関係（2022年・47都道府県）',
              fontsize=12, fontweight='bold', pad=12)
ax2.set_xlabel('高齢化率（%）', fontsize=11)
ax2.set_ylabel('保健医療費（円/月・二人以上世帯）', fontsize=11)
ax2.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}%'))
ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:,.0f}'))
ax2.grid(alpha=0.3, linestyle='--')
ax2.set_facecolor('#FAFAFA')

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2021_H5_3_fig2.png'), bbox_inches='tight')
plt.close(fig2)
print("図2 保存完了")

# ══════════════════════════════════════════════════════════════
# 図3: 標準化偏回帰係数（横棒グラフ）
# ══════════════════════════════════════════════════════════════
fig3, ax3 = plt.subplots(figsize=(8, 5))

coef_sorted = coef_df.sort_values('標準化偏回帰係数')
colors3 = ['#e05c5c' if v < 0 else '#4e9af1' for v in coef_sorted['標準化偏回帰係数']]
bars = ax3.barh(coef_sorted['ラベル'], coef_sorted['標準化偏回帰係数'],
                color=colors3, alpha=0.85, edgecolor='white', linewidth=0.5)

# 誤差バー
ax3.errorbar(coef_sorted['標準化偏回帰係数'], range(len(coef_sorted)),
             xerr=1.96 * coef_sorted['標準誤差'],
             fmt='none', color='black', capsize=4, linewidth=1.2)

# p値ラベル
for i, (_, row) in enumerate(coef_sorted.iterrows()):
    sig_mark = '***' if row['p値'] < 0.001 else ('**' if row['p値'] < 0.01 else ('*' if row['p値'] < 0.05 else 'n.s.'))
    x_pos = row['標準化偏回帰係数'] + (0.02 if row['標準化偏回帰係数'] >= 0 else -0.02)
    ha = 'left' if row['標準化偏回帰係数'] >= 0 else 'right'
    ax3.text(x_pos, i, f"p={row['p値']:.3f} {sig_mark}", va='center', ha=ha, fontsize=9, color='#333')

ax3.axvline(0, color='black', linewidth=0.8)
ax3.set_title(f'図3: 標準化偏回帰係数（目的変数: 保健医療費）\nR²={result.rsquared:.3f}, adj.R²={result.rsquared_adj:.3f}, N=47',
              fontsize=11, fontweight='bold', pad=10)
ax3.set_xlabel('標準化偏回帰係数 (β)', fontsize=11)
ax3.grid(axis='x', alpha=0.35, linestyle='--')
ax3.set_facecolor('#FAFAFA')

# 凡例
from matplotlib.patches import Patch
legend_els = [Patch(facecolor='#4e9af1', label='正の効果（保健医療費を増加）'),
              Patch(facecolor='#e05c5c', label='負の効果（保健医療費を減少）')]
ax3.legend(handles=legend_els, loc='lower right', fontsize=8.5)

plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2021_H5_3_fig3.png'), bbox_inches='tight')
plt.close(fig3)
print("図3 保存完了")

# ══════════════════════════════════════════════════════════════
# 図4: 都道府県別 保健医療費ランキング（棒グラフ）
# ══════════════════════════════════════════════════════════════
df_rank = df22.sort_values('保健医療費_人', ascending=True).copy()
colors4 = [region_colors.get(r, '#888888') for r in df_rank['地域']]

fig4, ax4 = plt.subplots(figsize=(10, 11))
bars4 = ax4.barh(df_rank['都道府県'], df_rank['保健医療費_人'],
                 color=colors4, alpha=0.85, edgecolor='white', linewidth=0.4)

# 全国平均線
nat_avg = df22['保健医療費_人'].mean()
ax4.axvline(nat_avg, color='black', linestyle='--', linewidth=1.5, alpha=0.7)
ax4.text(nat_avg + 100, len(df_rank) - 1, f'全国平均\n{nat_avg:,.0f}円',
         va='top', ha='left', fontsize=8.5, color='#333')

# 値ラベル
for bar, val in zip(bars4, df_rank['保健医療費_人']):
    ax4.text(val + 50, bar.get_y() + bar.get_height() / 2,
             f'{val:,.0f}', va='center', ha='left', fontsize=7.5)

# 凡例（地域）
handles4 = [plt.Line2D([0], [0], marker='s', color='w',
            markerfacecolor=region_colors[r], markersize=10, label=r)
            for r in region_order]
ax4.legend(handles=handles4, loc='lower right', fontsize=8.5, framealpha=0.9,
           title='地域区分', title_fontsize=8)

ax4.set_title('図4: 都道府県別 保健医療費（2022年・二人以上世帯）\n地域別の格差に注目', fontsize=12, fontweight='bold', pad=12)
ax4.set_xlabel('保健医療費（円/月）', fontsize=11)
ax4.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:,.0f}'))
ax4.grid(axis='x', alpha=0.35, linestyle='--')
ax4.set_facecolor('#FAFAFA')
ax4.tick_params(axis='y', labelsize=9)

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2021_H5_3_fig4.png'), bbox_inches='tight')
plt.close(fig4)
print("図4 保存完了")

print()
print("=" * 60)
print("全図の生成が完了しました。")
print(f"出力先: {os.path.abspath(FIG_DIR)}")

# ── HTML用の統計値を出力 ──────────────────────────────────────────────────────
print()
print("=== HTML記載用 統計値 ===")
print(f"高齢化率 全国平均 (2022): {df22['高齢化率'].mean():.1f}%")
print(f"高齢化率 最高: {df22.loc[df22['高齢化率'].idxmax(), '都道府県']} {df22['高齢化率'].max():.1f}%")
print(f"高齢化率 最低: {df22.loc[df22['高齢化率'].idxmin(), '都道府県']} {df22['高齢化率'].min():.1f}%")
print(f"保健医療費 全国平均 (2022): {df22['保健医療費_人'].mean():,.0f}円/月")
print(f"保健医療費 最高: {df22.loc[df22['保健医療費_人'].idxmax(), '都道府県']} {df22['保健医療費_人'].max():,.0f}円")
print(f"保健医療費 最低: {df22.loc[df22['保健医療費_人'].idxmin(), '都道府県']} {df22['保健医療費_人'].min():,.0f}円")
print(f"保健医療費 格差倍率: {df22['保健医療費_人'].max() / df22['保健医療費_人'].min():.1f}倍")
print(f"高齢化率×保健医療費 相関: r={stats.pearsonr(df22['高齢化率'], df22['保健医療費_人'])[0]:.3f}")
print(f"OLS R²={result.rsquared:.3f}  adj.R²={result.rsquared_adj:.3f}")
print()

# 地域別 高齢化率 2022
for region in region_order:
    val = df22[df22['地域'] == region]['高齢化率'].mean()
    fee = df22[df22['地域'] == region]['保健医療費_人'].mean()
    print(f"  {region}: 高齢化率 {val:.1f}%  保健医療費 {fee:,.0f}円")

# 高齢化率推移
print()
print("高齢化率 全国平均推移:")
for yr in [2012, 2015, 2018, 2022, 2023]:
    d = df_b[df_b['年度'] == yr]
    if len(d) > 0:
        print(f"  {yr}年: {d['高齢化率'].mean():.1f}%")