"""
2019_H1_daijin.py
都道府県別医療資源の偏在と健康格差：医師数・病床数の地域不均衡分析
（総務大臣賞 高校生部門 2019年度）
実データ使用: SSDSE-B-2026.csv (2019年度データ)
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2019_H1_daijin.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import statsmodels.api as sm
from scipy import stats

plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ── データ読み込み ──────────────────────────────────────────────────────────────
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

print("df_b columns:", df_b.columns.tolist())
print("Available years:", sorted(df_b['年度'].unique()))

# 2019年度データを使用
df = df_b[df_b['年度'] == 2019].copy().reset_index(drop=True)
print(f"\n2019年度 都道府県数: {len(df)}")

# ── 派生変数の計算 ─────────────────────────────────────────────────────────────
df['人口万人']        = df['総人口'] / 10000
df['病院数per万人']   = df['一般病院数'] / df['人口万人']
df['診療所数per万人'] = df['一般診療所数'] / df['人口万人']
df['死亡率per千人']   = df['死亡数'] / df['総人口'] * 1000
df['高齢化率']        = df['65歳以上人口'] / df['総人口'] * 100
df['保健医療費']      = df['保健医療費（二人以上の世帯）']
df['log人口']         = np.log(df['総人口'])

# 地域分類
region_map = {
    '北海道': '北海道',
    '青森県': '東北', '岩手県': '東北', '宮城県': '東北',
    '秋田県': '東北', '山形県': '東北', '福島県': '東北',
    '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国', '島根県': '中国', '岡山県': '中国', '広島県': '中国', '山口県': '中国',
    '徳島県': '四国', '香川県': '四国', '愛媛県': '四国', '高知県': '四国',
    '福岡県': '九州', '佐賀県': '九州', '長崎県': '九州', '熊本県': '九州',
    '大分県': '九州', '宮崎県': '九州', '鹿児島県': '九州', '沖縄県': '九州',
}
region_colors = {
    '北海道': '#1f77b4',
    '東北':   '#ff7f0e',
    '関東':   '#2ca02c',
    '中部':   '#d62728',
    '近畿':   '#9467bd',
    '中国':   '#8c564b',
    '四国':   '#e377c2',
    '九州':   '#bcbd22',
}
df['地域'] = df['都道府県'].map(region_map)
df['色']   = df['地域'].map(region_colors)

# ── Gini係数計算関数 ───────────────────────────────────────────────────────────
def gini(values):
    """Gini係数を計算する（0=完全平等, 1=完全不平等）"""
    v = np.sort(np.array(values, dtype=float))
    n = len(v)
    cumv = np.cumsum(v)
    return (n + 1 - 2 * np.sum(cumv) / cumv[-1]) / n

def lorenz_curve(values):
    """Lorenz曲線の座標を返す"""
    v = np.sort(np.array(values, dtype=float))
    n = len(v)
    cum_pop  = np.concatenate([[0], np.arange(1, n + 1) / n])
    cum_val  = np.concatenate([[0], np.cumsum(v) / v.sum()])
    return cum_pop, cum_val

# ── Gini係数の計算 ────────────────────────────────────────────────────────────
g_hospital = gini(df['病院数per万人'].values)
g_clinic   = gini(df['診療所数per万人'].values)
print(f"\nGini係数（病院数/万人）:   {g_hospital:.4f}")
print(f"Gini係数（診療所数/万人）: {g_clinic:.4f}")

# ── 上位・下位5都道府県 ────────────────────────────────────────────────────────
top5    = df.nlargest(5, '病院数per万人')[['都道府県', '病院数per万人', '死亡率per千人', '高齢化率']]
bottom5 = df.nsmallest(5, '病院数per万人')[['都道府県', '病院数per万人', '死亡率per千人', '高齢化率']]
print("\n病院数/万人 上位5:")
print(top5.to_string(index=False))
print("\n病院数/万人 下位5:")
print(bottom5.to_string(index=False))

# ══════════════════════════════════════════════════════════════════════
# 図1: 都道府県別病院数/万人 横棒グラフ
# ══════════════════════════════════════════════════════════════════════
fig, ax = plt.subplots(figsize=(10, 13))

df_sorted = df.sort_values('病院数per万人', ascending=True).reset_index(drop=True)
bars = ax.barh(
    df_sorted['都道府県'],
    df_sorted['病院数per万人'],
    color=df_sorted['色'],
    edgecolor='white', linewidth=0.5, height=0.75
)

# 全国平均線
nat_avg = df['病院数per万人'].mean()
ax.axvline(nat_avg, color='#333333', linewidth=1.8, linestyle='--', label=f'全国平均: {nat_avg:.3f}件/万人')

ax.set_xlabel('一般病院数（人口10,000人あたり）', fontsize=12)
ax.set_title('都道府県別 一般病院数（人口万人あたり）\n2019年度', fontsize=14, fontweight='bold', pad=15)
ax.set_xlim(0, df_sorted['病院数per万人'].max() * 1.15)
ax.grid(axis='x', alpha=0.3, linestyle=':')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# 凡例（地域）
handles = [mpatches.Patch(color=c, label=r) for r, c in region_colors.items()]
handles.append(plt.Line2D([0], [0], color='#333333', linewidth=1.8, linestyle='--', label=f'全国平均: {nat_avg:.3f}'))
ax.legend(handles=handles, loc='lower right', fontsize=9, framealpha=0.8, ncol=2)

# 値ラベル（右端に）
for bar, val in zip(bars, df_sorted['病院数per万人']):
    ax.text(val + 0.01, bar.get_y() + bar.get_height() / 2,
            f'{val:.2f}', va='center', ha='left', fontsize=7.5)

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_H1_fig1.png'), bbox_inches='tight')
plt.close(fig)
print("\n図1 保存完了: 2019_H1_fig1.png")

# ══════════════════════════════════════════════════════════════════════
# 図2: Lorenz曲線（医療資源の地域不均衡）
# ══════════════════════════════════════════════════════════════════════
fig, axes = plt.subplots(1, 2, figsize=(12, 5.5))

for ax, col, label, color, g in [
    (axes[0], '病院数per万人',   '一般病院数', '#1f77b4', g_hospital),
    (axes[1], '診療所数per万人', '一般診療所数', '#e377c2', g_clinic),
]:
    cum_pop, cum_val = lorenz_curve(df[col].values)
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='完全平等線', zorder=1)
    ax.fill_between(cum_pop, cum_pop, cum_val, alpha=0.18, color=color)
    ax.plot(cum_pop, cum_val, '-', color=color, linewidth=2.5,
            label=f'Lorenz曲線\nGini係数 = {g:.4f}', zorder=2)
    ax.set_xlabel('累積人口割合', fontsize=11)
    ax.set_ylabel('累積医療資源割合', fontsize=11)
    ax.set_title(f'{label}のLorenz曲線（都道府県間格差）', fontsize=12, fontweight='bold')
    ax.legend(fontsize=10, loc='upper left')
    ax.set_xlim(0, 1); ax.set_ylim(0, 1)
    ax.grid(alpha=0.3, linestyle=':')
    ax.text(0.65, 0.08, f'Gini = {g:.4f}', fontsize=13, fontweight='bold',
            color=color, transform=ax.transAxes)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

fig.suptitle('医療資源の地域格差：Lorenz曲線分析（2019年度）',
             fontsize=13, fontweight='bold', y=1.02)
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_H1_fig2.png'), bbox_inches='tight')
plt.close(fig)
print("図2 保存完了: 2019_H1_fig2.png")

# ══════════════════════════════════════════════════════════════════════
# 図3: 医療資源密度 vs 死亡率 散布図
# ══════════════════════════════════════════════════════════════════════
fig, ax = plt.subplots(figsize=(9, 7))

x = df['病院数per万人'].values
y = df['死亡率per千人'].values

# 地域ごとに散布
for region, color in region_colors.items():
    mask = df['地域'] == region
    ax.scatter(x[mask], y[mask], c=color, s=60, zorder=3, label=region, edgecolors='white', linewidth=0.5)

# 都道府県ラベル
for _, row in df.iterrows():
    ax.annotate(
        row['都道府県'].replace('県', '').replace('府', '').replace('都', '').replace('道', ''),
        (row['病院数per万人'], row['死亡率per千人']),
        fontsize=7, ha='center', va='bottom', color='#333333',
        xytext=(0, 3), textcoords='offset points'
    )

# 回帰直線
slope, intercept, r, p, se = stats.linregress(x, y)
xline = np.linspace(x.min(), x.max(), 100)
ax.plot(xline, intercept + slope * xline, 'r-', linewidth=2,
        label=f'回帰直線\nr = {r:.3f}, p = {p:.4f}', zorder=2)

ax.set_xlabel('一般病院数（人口万人あたり）', fontsize=12)
ax.set_ylabel('死亡率（人口千人あたり）', fontsize=12)
ax.set_title('医療資源密度と死亡率の関係\n都道府県別（2019年度）', fontsize=13, fontweight='bold')
ax.legend(fontsize=9, loc='upper left', framealpha=0.85)
ax.grid(alpha=0.25, linestyle=':')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# 注釈: Pearson r
ax.text(0.97, 0.07,
        f'Pearson r = {r:.3f}\np = {p:.4f}',
        transform=ax.transAxes, fontsize=10, ha='right',
        bbox=dict(boxstyle='round,pad=0.4', facecolor='#fff3cd', edgecolor='#f0ad4e'))

print(f"\n【散布図】病院数/万人 vs 死亡率")
print(f"  Pearson r = {r:.4f}, p = {p:.4f}")
print(f"  回帰式: y = {slope:.4f}x + {intercept:.4f}")

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_H1_fig3.png'), bbox_inches='tight')
plt.close(fig)
print("図3 保存完了: 2019_H1_fig3.png")

# ══════════════════════════════════════════════════════════════════════
# 図4: 重回帰分析 — 標準化偏回帰係数
# ══════════════════════════════════════════════════════════════════════

# 説明変数: 病院数密度, 診療所数密度, 高齢化率, log人口, 保健医療費
X_vars = {
    '病院数\n（/万人）':   df['病院数per万人'].values,
    '診療所数\n（/万人）': df['診療所数per万人'].values,
    '高齢化率\n（%）':     df['高齢化率'].values,
    'log人口':             df['log人口'].values,
    '保健医療費\n（円）':  df['保健医療費'].values,
}
y_reg = df['死亡率per千人'].values

# 標準化
def standardize(arr):
    return (arr - arr.mean()) / arr.std()

X_std_list = [standardize(v) for v in X_vars.values()]
y_std = standardize(y_reg)
X_std_arr = np.column_stack(X_std_list)
X_const = sm.add_constant(X_std_arr)

model = sm.OLS(y_std, X_const).fit()
print("\n【重回帰分析結果】")
print(model.summary())

coefs  = np.array(model.params[1:], dtype=float)   # 定数項を除く
ci_all = model.conf_int()
ci     = np.array(ci_all[1:], dtype=float)        # (n, 2) numpy array
pvals  = np.array(model.pvalues[1:], dtype=float)
labels = list(X_vars.keys())

# 係数を昇順にソート（横棒グラフ用）
order    = np.argsort(coefs)
labels_o = [labels[i] for i in order]
coefs_o  = coefs[order]
ci_lo_o  = ci[order, 0]
ci_hi_o  = ci[order, 1]
pvals_o  = pvals[order]

# 有意マーク
sig_marks = ['***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else '' for p in pvals_o]

bar_colors = ['#d62728' if c < 0 else '#1f77b4' for c in coefs_o]

fig, ax = plt.subplots(figsize=(8, 5.5))
ypos = np.arange(len(labels_o))

bars = ax.barh(ypos, coefs_o, color=bar_colors, alpha=0.85,
               edgecolor='white', height=0.55, zorder=3)
ax.errorbar(coefs_o, ypos,
            xerr=[coefs_o - ci_lo_o, ci_hi_o - coefs_o],
            fmt='none', ecolor='#333333', elinewidth=1.5, capsize=4, zorder=4)

ax.axvline(0, color='#333333', linewidth=1.2, zorder=2)
ax.set_yticks(ypos)
ax.set_yticklabels(labels_o, fontsize=10)
ax.set_xlabel('標準化偏回帰係数（β）', fontsize=11)
ax.set_title('重回帰分析：死亡率への影響要因\n標準化偏回帰係数と95%信頼区間（2019年度）',
             fontsize=12, fontweight='bold')
ax.grid(axis='x', alpha=0.3, linestyle=':')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# 有意マーク & 係数値
for i, (c, mark, p) in enumerate(zip(coefs_o, sig_marks, pvals_o)):
    txt = f'{c:+.3f}{mark}'
    ax.text(c + (0.02 if c >= 0 else -0.02),
            i, txt, va='center',
            ha='left' if c >= 0 else 'right',
            fontsize=9.5, fontweight='bold')

# R²
ax.text(0.98, 0.05, f'R² = {model.rsquared:.3f}\nadj.R² = {model.rsquared_adj:.3f}',
        transform=ax.transAxes, fontsize=9.5, ha='right',
        bbox=dict(boxstyle='round,pad=0.4', facecolor='#e8f4f8', edgecolor='#aec6cf'))

# 凡例
red_patch  = mpatches.Patch(color='#d62728', alpha=0.85, label='負の効果（死亡率を下げる）')
blue_patch = mpatches.Patch(color='#1f77b4', alpha=0.85, label='正の効果（死亡率を上げる）')
ax.legend(handles=[blue_patch, red_patch], fontsize=8.5, loc='lower right')

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_H1_fig4.png'), bbox_inches='tight')
plt.close(fig)
print("図4 保存完了: 2019_H1_fig4.png")

# ── 最終サマリー印刷 ───────────────────────────────────────────────────────────
print("\n" + "=" * 60)
print("分析サマリー")
print("=" * 60)
print(f"使用データ: SSDSE-B-2026.csv (2019年度, N=47都道府県)")
print(f"Gini係数（一般病院数/万人）: {g_hospital:.4f}")
print(f"Gini係数（一般診療所数/万人）: {g_clinic:.4f}")
print(f"\n相関分析 (病院数密度 vs 死亡率): r={r:.3f}, p={p:.4f}")
print(f"重回帰 R²={model.rsquared:.3f}, adj.R²={model.rsquared_adj:.3f}")
print("\n病院数/万人 上位5:")
print(df.nlargest(5, '病院数per万人')[['都道府県', '病院数per万人']].to_string(index=False))
print("\n病院数/万人 下位5:")
print(df.nsmallest(5, '病院数per万人')[['都道府県', '病院数per万人']].to_string(index=False))

print("\nDONE: 2019_H1_daijin")