"""
2021_H4_katsuyo.py
統計活用奨励賞（高校生の部） 2021年度
「スポーツ施設・体育活動と医療費・健康指標の関係分析」
重回帰分析・相関分析・地域比較 ／ データ: SSDSE-B-2026.csv（2022年断面）
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2021_H4_katsuyo.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ─────────────────────────────────────────────
# データ読み込み
# ─────────────────────────────────────────────
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

# 2022年断面
df = df_b[df_b['年度'] == 2022].copy().reset_index(drop=True)
assert len(df) == 47, f"47都道府県のはずが {len(df)} 行"

# ─────────────────────────────────────────────
# 変数定義
# ─────────────────────────────────────────────
# スポーツ・健康活動のプロキシ
df['教養娯楽費'] = df['教養娯楽費（二人以上の世帯）']          # 円/月
df['保健医療費'] = df['保健医療費（二人以上の世帯）']          # 円/月（目的変数）
df['高齢化率']   = df['65歳以上人口'] / df['総人口'] * 100     # %
df['病院密度']   = df['一般病院数'] / df['総人口'] * 100000    # 10万人あたり

# 相関分析用変数
vars_for_corr = {
    '教養娯楽費\n（スポーツ等代理）': '教養娯楽費',
    '保健医療費\n（医療費指標）':    '保健医療費',
    '高齢化率\n（％）':             '高齢化率',
    '病院密度\n（10万人あたり）':    '病院密度',
}

corr_labels = list(vars_for_corr.keys())
corr_cols   = list(vars_for_corr.values())

# 地域マップ
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北', '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄'
}
region_colors = {
    '北海道・東北': '#4e9af1', '関東': '#e05c5c', '中部': '#f0a500',
    '近畿': '#5cb85c', '中国・四国': '#9b59b6', '九州・沖縄': '#f39c12'
}

df['地域'] = df['都道府県'].map(region_map)
df['色']   = df['地域'].map(region_colors)

# ─────────────────────────────────────────────
# 重回帰分析（OLS）
# ─────────────────────────────────────────────
# 説明変数: 教養娯楽費、高齢化率、病院密度
# 目的変数: 保健医療費
X_cols = ['教養娯楽費', '高齢化率', '病院密度']
X_labels = ['教養娯楽費\n（スポーツ等代理）', '高齢化率（%）', '病院密度\n（10万人あたり）']
y_col  = '保健医療費'

df_reg = df[X_cols + [y_col]].dropna()
X_raw = df_reg[X_cols].values
y     = df_reg[y_col].values

# 標準化
X_mean = X_raw.mean(axis=0)
X_std  = X_raw.std(axis=0)
X_std_ = X_raw.std(axis=0, ddof=1)
y_std_ = y.std(ddof=1)
X_scaled = (X_raw - X_mean) / X_std

# OLS（標準化変数）
X_sm = sm.add_constant(X_scaled)
model = sm.OLS(y, X_sm).fit()
print(model.summary())

# 標準化偏回帰係数
std_coefs = model.params[1:]  # 定数項を除く
std_pvals = model.pvalues[1:]
print("\n標準化偏回帰係数:")
for lbl, coef, p in zip(X_labels, std_coefs, std_pvals):
    lbl2 = lbl.replace('\n', ' ')
    print(f"  {lbl2}: β={coef:.4f}, p={p:.4f}")

# 非標準化モデル（R²確認用）
X_sm_raw = sm.add_constant(X_raw)
model_raw = sm.OLS(y, X_sm_raw).fit()
print(f"\nR² = {model_raw.rsquared:.4f}")
print(f"調整済みR² = {model_raw.rsquared_adj:.4f}")

# ─────────────────────────────────────────────
# 図1: 散布図 教養娯楽費 vs 保健医療費
# ─────────────────────────────────────────────
fig1, ax1 = plt.subplots(figsize=(9, 7))

for region, color in region_colors.items():
    mask = df['地域'] == region
    ax1.scatter(
        df.loc[mask, '教養娯楽費'], df.loc[mask, '保健医療費'],
        c=color, s=60, alpha=0.85, label=region, zorder=3
    )

# 都道府県ラベル
for _, row in df.iterrows():
    name = row['都道府県'].replace('県', '').replace('府', '').replace('都', '').replace('道', '')
    ax1.annotate(
        name,
        (row['教養娯楽費'], row['保健医療費']),
        fontsize=6.5, ha='center', va='bottom',
        xytext=(0, 4), textcoords='offset points'
    )

# 回帰直線
x_vals = np.array([df['教養娯楽費'].min(), df['教養娯楽費'].max()])
slope, intercept, r, p, _ = stats.linregress(df['教養娯楽費'], df['保健医療費'])
ax1.plot(x_vals, slope * x_vals + intercept, 'k--', linewidth=1.5,
         label=f'回帰線 r={r:.2f} (p={p:.3f})')

ax1.set_xlabel('教養娯楽費（円/月）', fontsize=12)
ax1.set_ylabel('保健医療費（円/月）', fontsize=12)
ax1.set_title('図1：教養娯楽費（スポーツ等代理変数）と保健医療費の関係\n2022年 都道府県別', fontsize=13)
ax1.legend(fontsize=9, loc='upper left', framealpha=0.9)
ax1.grid(True, alpha=0.3)

fig1.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2021_H4_fig1.png'), bbox_inches='tight')
plt.close(fig1)
print("fig1 saved")

# ─────────────────────────────────────────────
# 図2: 相関ヒートマップ
# ─────────────────────────────────────────────
corr_data = df[corr_cols].copy()
corr_data.columns = corr_labels

corr_matrix = corr_data.corr()

fig2, ax2 = plt.subplots(figsize=(7, 6))
n = len(corr_cols)
im = ax2.imshow(corr_matrix.values, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax2, shrink=0.8, label='Pearson相関係数')

ax2.set_xticks(range(n))
ax2.set_yticks(range(n))
ax2.set_xticklabels(corr_labels, fontsize=9)
ax2.set_yticklabels(corr_labels, fontsize=9)

for i in range(n):
    for j in range(n):
        val = corr_matrix.values[i, j]
        # p値計算
        if i != j:
            _, pval = stats.pearsonr(corr_data.iloc[:, i].dropna(), corr_data.iloc[:, j].dropna())
            star = '**' if pval < 0.01 else ('*' if pval < 0.05 else '')
        else:
            star = ''
        text_color = 'white' if abs(val) > 0.5 else 'black'
        ax2.text(j, i, f'{val:.2f}{star}', ha='center', va='center',
                 fontsize=10, color=text_color, fontweight='bold')

ax2.set_title('図2：健康・スポーツ関連変数のPearson相関行列\n2022年 都道府県別（*p<0.05, **p<0.01）', fontsize=12)
fig2.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2021_H4_fig2.png'), bbox_inches='tight')
plt.close(fig2)
print("fig2 saved")

# ─────────────────────────────────────────────
# 図3: 標準化偏回帰係数プロット
# ─────────────────────────────────────────────
fig3, ax3 = plt.subplots(figsize=(8, 5))

colors_coef = ['#e05c5c' if p < 0.05 else '#aaaaaa' for p in std_pvals]
y_pos = range(len(X_labels))

bars = ax3.barh(list(y_pos), std_coefs, color=colors_coef, height=0.5, edgecolor='white')

for bar, val, p in zip(bars, std_coefs, std_pvals):
    label = f'β={val:.3f}'
    if p < 0.01:
        label += ' **'
    elif p < 0.05:
        label += ' *'
    else:
        label += f' (n.s.)'
    x_off = 0.01 if val >= 0 else -0.01
    ha = 'left' if val >= 0 else 'right'
    ax3.text(val + x_off, bar.get_y() + bar.get_height() / 2,
             label, va='center', ha=ha, fontsize=10)

ax3.axvline(0, color='black', linewidth=0.8, linestyle='-')
ax3.set_yticks(list(y_pos))
ax3.set_yticklabels(X_labels, fontsize=10)
ax3.set_xlabel('標準化偏回帰係数 β', fontsize=11)
ax3.set_title(
    f'図3：保健医療費に対する標準化偏回帰係数\n'
    f'2022年 都道府県別 OLS（R²={model_raw.rsquared:.3f}, adj.R²={model_raw.rsquared_adj:.3f}）',
    fontsize=12
)
ax3.set_xlim(min(std_coefs) - 0.25, max(std_coefs) + 0.25)

# 凡例
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='#e05c5c', label='有意 (p<0.05)'),
    Patch(facecolor='#aaaaaa', label='非有意 (p≥0.05)')
]
ax3.legend(handles=legend_elements, fontsize=9, loc='lower right')
ax3.grid(True, axis='x', alpha=0.3)

fig3.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2021_H4_fig3.png'), bbox_inches='tight')
plt.close(fig3)
print("fig3 saved")

# ─────────────────────────────────────────────
# 図4: 都道府県別保健医療費ランキング（上位・下位各10）
# ─────────────────────────────────────────────
df_sorted = df[['都道府県', '保健医療費', '地域', '色']].sort_values('保健医療費', ascending=False)

top10  = df_sorted.head(10)
bot10  = df_sorted.tail(10)
plot_df = pd.concat([top10, bot10])

fig4, ax4 = plt.subplots(figsize=(10, 7))

short_names = [
    s.replace('県', '').replace('府', '').replace('都', '').replace('道', '')
    for s in plot_df['都道府県']
]

bars = ax4.bar(range(len(plot_df)), plot_df['保健医療費'].values,
               color=plot_df['色'].values, edgecolor='white', linewidth=0.5)

ax4.set_xticks(range(len(plot_df)))
ax4.set_xticklabels(short_names, rotation=45, ha='right', fontsize=9)
ax4.set_ylabel('保健医療費（円/月、二人以上の世帯）', fontsize=11)
ax4.set_title('図4：都道府県別保健医療費ランキング\n上位10都道府県・下位10都道府県（2022年）', fontsize=12)

# 上位/下位区分線
ax4.axvline(9.5, color='gray', linestyle='--', linewidth=1.5, alpha=0.7)
ax4.text(4.7, ax4.get_ylim()[1] * 0.97, '高い（上位10）', ha='center', fontsize=9, color='gray')
ax4.text(14.5, ax4.get_ylim()[1] * 0.97, '低い（下位10）', ha='center', fontsize=9, color='gray')

# 地域凡例
handles = [Patch(facecolor=c, label=r) for r, c in region_colors.items()]
ax4.legend(handles=handles, fontsize=8, loc='upper right', framealpha=0.9, ncol=2)
ax4.grid(True, axis='y', alpha=0.3)

# 値ラベル
for bar in bars:
    h = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width() / 2, h + 100,
             f'{int(h):,}', ha='center', va='bottom', fontsize=7)

fig4.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2021_H4_fig4.png'), bbox_inches='tight')
plt.close(fig4)
print("fig4 saved")

# ─────────────────────────────────────────────
# 統計サマリー出力（HTML作成用）
# ─────────────────────────────────────────────
print("\n===== 統計サマリー =====")
print(f"分析対象: 2022年 47都道府県")
print(f"保健医療費 平均: {df['保健医療費'].mean():.0f} 円/月")
print(f"保健医療費 最大: {df.loc[df['保健医療費'].idxmax(), '都道府県']} ({df['保健医療費'].max():.0f})")
print(f"保健医療費 最小: {df.loc[df['保健医療費'].idxmin(), '都道府県']} ({df['保健医療費'].min():.0f})")
print(f"教養娯楽費 平均: {df['教養娯楽費'].mean():.0f} 円/月")
print(f"高齢化率 平均: {df['高齢化率'].mean():.1f}%")
r_kyo, p_kyo = stats.pearsonr(df['教養娯楽費'], df['保健医療費'])
print(f"教養娯楽費-保健医療費 相関: r={r_kyo:.3f}, p={p_kyo:.4f}")
print(f"重回帰 R²: {model_raw.rsquared:.3f}")
print(f"重回帰 調整済みR²: {model_raw.rsquared_adj:.3f}")
print(f"教養娯楽費 β: {std_coefs[0]:.3f}, p={std_pvals[0]:.4f}")
print(f"高齢化率 β: {std_coefs[1]:.3f}, p={std_pvals[1]:.4f}")
print(f"病院密度 β: {std_coefs[2]:.3f}, p={std_pvals[2]:.4f}")

top_ken  = df.nlargest(5, '保健医療費')[['都道府県', '保健医療費', '地域']]
bot_ken  = df.nsmallest(5, '保健医療費')[['都道府県', '保健医療費', '地域']]
print("\n保健医療費 上位5:")
print(top_ken.to_string(index=False))
print("\n保健医療費 下位5:")
print(bot_ken.to_string(index=False))

# 地域別平均
reg_mean = df.groupby('地域')['保健医療費'].mean().sort_values(ascending=False)
print("\n地域別保健医療費 平均:")
print(reg_mean.to_string())