"""
教育用再現コード: 2024年 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：う蝕罹患に関する要因の研究とよりよい生活の提案
著者：徐煌哲（かえつ有明高等学校）

【分析概要・実データ版】
  データ：SSDSE-B-2026.csv（2022年度・47都道府県）
  目的変数：dental_per_10k（歯科診療所数 per 10,000人 = I5103 / A1101 × 10000）
            ※元論文のう歯本数データはSSDS-Bに含まれないため、
              歯科診療所密度を口腔保健インフラの代理指標として使用
  説明変数：
    食料費割合         L322101 / L3221 × 100
    食料費             L322101（食生活指標）
    合計特殊出生率     A4103
    消費支出           L3221
    高齢化率           A1303 / A1101 × 100
    15歳未満女性割合   A130102 / A110102 × 100
    小学校密度         E2101 / A1101 × 10000
    住宅地標準価格     C5401

  Step1. 全説明変数との相関分析（47都道府県、|r|順棒グラフ）
  Step2. 外れ値（東京都）の特定と除外・再分析（高齢化率軸で散布図）
  Step3. 消費支出 vs 歯科診療所密度（外れ値あり・なし比較）
  Step4. 食料費割合 vs 歯科診療所密度（回帰直線付き）

【主要な発見】
  最強の正相関：住宅地標準価格（r=0.647）→ 経済力が歯科インフラを牽引
  最強の負相関：合計特殊出生率（r=-0.382）
  東京都は歯科診療所密度が突出して高い外れ値（7.6診療所/万人）

【データサイエンス学習ポイント】
  1. 公的統計（SSDSE-B）のみを使用した再現可能な分析
  2. 直接データがない場合の代理変数（proxy）の設定
  3. 外れ値の特定と除外が相関係数に与える影響
  4. 相関から因果への慎重な推論
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2024_H5_4_shorei.py
# ============================================================


import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import stats
import os
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

DATA_PATH = 'data/raw/SSDSE-B-2026.csv'
FIG_DIR   = 'html/figures'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ データ読み込み（SSDSE-B-2026、2022年度、都道府県レベル）
# ================================================================
# header=0: 1行目がコード（A1101 等）→ それを列名として使用
# その後 iloc[1:] で日本語名行を除去し、都道府県コード行だけ抽出
df_raw = pd.read_csv(DATA_PATH, encoding='cp932', header=0)
df_raw = df_raw.iloc[1:].reset_index(drop=True)
df_raw = df_raw.rename(columns={
    'SSDSE-B-2026': '年度',
    'Code':         '地域コード',
    'Prefecture':   '都道府県',
})

# 都道府県レベル（R\d{5} 形式）のみ・2022年度に絞る
df_raw = df_raw[df_raw['地域コード'].str.match(r'^R\d{5}$', na=False)]
df_b   = df_raw[df_raw['年度'] == '2022'].copy().reset_index(drop=True)

# 数値化
num_cols = ['A1101', 'A1303', 'A130102', 'A110102',
            'A4103', 'E2101', 'C5401', 'L3221', 'L322101', 'I5103']
for col in num_cols:
    df_b[col] = pd.to_numeric(df_b[col], errors='coerce')

df_b = df_b.dropna(subset=num_cols).reset_index(drop=True)

# ================================================================
# ■ 派生変数の計算
# ================================================================
df_b['dental_per_10k']    = df_b['I5103']    / df_b['A1101']   * 10000   # 歯科診療所数/万人
df_b['aging_rate']        = df_b['A1303']    / df_b['A1101']   * 100     # 高齢化率（%）
df_b['child_female_ratio']= df_b['A130102'] / df_b['A110102'] * 100     # 15歳未満女性割合（%）
df_b['food_ratio']        = df_b['L322101'] / df_b['L3221']   * 100     # 食料費割合（%）
df_b['school_density']    = df_b['E2101']   / df_b['A1101']   * 10000   # 小学校密度（校/万人）

OUTCOME = 'dental_per_10k'
OUTCOME_LABEL = '歯科診療所密度（診療所数/万人）'

# 説明変数の定義：コード → 表示名
PREDICTORS = {
    '食料費割合（%）':           'food_ratio',
    '食料費（円/月）':           'L322101',
    '合計特殊出生率':             'A4103',
    '消費支出（円/月）':         'L3221',
    '高齢化率（%）':             'aging_rate',
    '15歳未満女性割合（%）':     'child_female_ratio',
    '小学校密度（校/万人）':     'school_density',
    '住宅地標準価格（円/m²）':   'C5401',
}
PRED_LABELS = list(PREDICTORS.keys())
PRED_COLS   = list(PREDICTORS.values())

N = len(df_b)
print("=" * 65)
print(f"■ データ概要: 2022年度 N={N}都道府県（SSDSE-B-2026）")
print("=" * 65)
print(df_b[['都道府県', OUTCOME] + PRED_COLS].describe().round(3))

# ================================================================
# ■ 外れ値の特定
# ================================================================
outlier_idx  = df_b[OUTCOME].idxmax()
outlier_pref = df_b.loc[outlier_idx, '都道府県']
mask_out     = df_b.index != outlier_idx      # 外れ値除外マスク（True = 外れ値以外）
df_excl      = df_b[mask_out].copy()
N_excl       = len(df_excl)

print(f"\n外れ値: {outlier_pref}（{OUTCOME}={df_b.loc[outlier_idx, OUTCOME]:.3f}）")

# ================================================================
# ■ Step1. 全説明変数との相関分析
# ================================================================
print("\n" + "=" * 65)
print("■ Step1. 各説明変数と歯科診療所密度の相関分析（N=47）")
print("=" * 65)
print(f"\n  {'変数':<22} {'r':>8} {'p値':>10} {'有意':>6}")
print("  " + "-" * 50)

corrs_all  = []
corrs_excl = []
for label, col in zip(PRED_LABELS, PRED_COLS):
    r_all,  p_all  = stats.pearsonr(df_b[col],    df_b[OUTCOME])
    r_excl, p_excl = stats.pearsonr(df_excl[col], df_excl[OUTCOME])
    corrs_all.append(r_all)
    corrs_excl.append(r_excl)
    sig = "**" if p_all < 0.01 else ("*" if p_all < 0.05 else "")
    print(f"  {label:<22} {r_all:>8.4f} {p_all:>10.4f} {sig:>6}")

print(f"\n外れ値（{outlier_pref}）除外後（N={N_excl}）:")
print(f"\n  {'変数':<22} {'r':>8} {'p値':>10} {'有意':>6}")
print("  " + "-" * 50)
for label, col, r_excl in zip(PRED_LABELS, PRED_COLS, corrs_excl):
    _, p_excl = stats.pearsonr(df_excl[col], df_excl[OUTCOME])
    sig = "**" if p_excl < 0.01 else ("*" if p_excl < 0.05 else "")
    print(f"  {label:<22} {r_excl:>8.4f} {p_excl:>10.4f} {sig:>6}")

# ================================================================
# ■ 図の生成（4枚）
# ================================================================

# ────────────────────────────────────────────────────────────────
# 図1: 相関係数棒グラフ（|r|順ソート）
# ────────────────────────────────────────────────────────────────
sorted_pairs = sorted(zip(PRED_LABELS, corrs_all), key=lambda x: abs(x[1]))
labels_sorted = [p[0] for p in sorted_pairs]
corrs_sorted  = [p[1] for p in sorted_pairs]
colors_sorted = ['#C62828' if r >= 0 else '#1565C0' for r in corrs_sorted]

fig1, ax1 = plt.subplots(figsize=(10, 6))
bars = ax1.barh(labels_sorted, corrs_sorted, color=colors_sorted, alpha=0.78, edgecolor='white')

for bar, r in zip(bars, corrs_sorted):
    xpos = bar.get_width() + (0.015 if r >= 0 else -0.015)
    ha   = 'left' if r >= 0 else 'right'
    ax1.text(xpos, bar.get_y() + bar.get_height() / 2,
             f'{r:.3f}', va='center', ha=ha, fontsize=9)

ax1.axvline(0, color='gray', linestyle='--', linewidth=0.9)
ax1.axvline( 0.3, color='gray', linestyle=':', linewidth=0.7, alpha=0.5)
ax1.axvline(-0.3, color='gray', linestyle=':', linewidth=0.7, alpha=0.5)
ax1.set_xlabel('ピアソン相関係数', fontsize=11)
ax1.set_title(f'各説明変数と歯科診療所密度の相関係数（N={N}都道府県、2022年度）\n'
              '正（赤）= 高密度地域と正の関係、負（青）= 逆の関係', fontsize=11, fontweight='bold')
ax1.set_xlim(-0.85, 0.85)
ax1.grid(axis='x', alpha=0.3)

plt.tight_layout()
out1 = os.path.join(FIG_DIR, '2024_H5_4_fig1_scatter.png')
fig1.savefig(out1, bbox_inches='tight', dpi=150)
plt.close(fig1)
print(f"\n図1保存: 2024_H5_4_fig1_scatter.png")

# ────────────────────────────────────────────────────────────────
# 図2: 高齢化率 vs 歯科診療所密度（東京都ハイライト）
# ────────────────────────────────────────────────────────────────
col2  = 'aging_rate'
lab2  = '高齢化率（%）'
r2a, p2a = stats.pearsonr(df_b[col2],    df_b[OUTCOME])
r2e, p2e = stats.pearsonr(df_excl[col2], df_excl[OUTCOME])

fig2, ax2 = plt.subplots(figsize=(8, 6))
ax2.scatter(df_excl[col2], df_excl[OUTCOME],
            color='#1565C0', s=55, alpha=0.7, zorder=3, label='各都道府県')
ax2.scatter(df_b.loc[outlier_idx, col2], df_b.loc[outlier_idx, OUTCOME],
            color='#C62828', s=180, zorder=5, marker='*', label=f'{outlier_pref}（外れ値候補）')
ax2.annotate(outlier_pref,
             (df_b.loc[outlier_idx, col2], df_b.loc[outlier_idx, OUTCOME]),
             textcoords='offset points', xytext=(8, 4),
             fontsize=10, color='#C62828', fontweight='bold')

# 全体の回帰直線
x_range = [df_b[col2].min() - 0.5, df_b[col2].max() + 0.5]
slope_a, intercept_a, *_ = stats.linregress(df_b[col2], df_b[OUTCOME])
ax2.plot(x_range, [slope_a * x + intercept_a for x in x_range],
         'b--', linewidth=1.5, alpha=0.6, label=f'回帰（全体）r={r2a:.3f}')

# 除外後の回帰直線
slope_e, intercept_e, *_ = stats.linregress(df_excl[col2], df_excl[OUTCOME])
ax2.plot(x_range, [slope_e * x + intercept_e for x in x_range],
         'r-', linewidth=2, alpha=0.8, label=f'回帰（{outlier_pref}除外）r={r2e:.3f}')

ax2.set_xlabel(lab2, fontsize=11)
ax2.set_ylabel(OUTCOME_LABEL, fontsize=11)
ax2.set_title(f'高齢化率と歯科診療所密度の関係\n（{outlier_pref}の外れ値効果を確認）',
              fontsize=12, fontweight='bold')
ax2.legend(fontsize=9)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
out2 = os.path.join(FIG_DIR, '2024_H5_4_fig2_corr.png')
fig2.savefig(out2, bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2024_H5_4_fig2_corr.png")

# ────────────────────────────────────────────────────────────────
# 図3: 消費支出 vs 歯科診療所密度（外れ値あり・なし比較）
# ────────────────────────────────────────────────────────────────
col3  = 'L3221'
lab3  = '消費支出（円/月）'
r3a, p3a = stats.pearsonr(df_b[col3],    df_b[OUTCOME])
r3e, p3e = stats.pearsonr(df_excl[col3], df_excl[OUTCOME])

fig3, ax3 = plt.subplots(figsize=(8, 6))
ax3.scatter(df_excl[col3], df_excl[OUTCOME],
            color='#2E7D32', s=55, alpha=0.7, zorder=3, label=f'各都道府県（{outlier_pref}除外）')
ax3.scatter(df_b.loc[outlier_idx, col3], df_b.loc[outlier_idx, OUTCOME],
            color='#C62828', s=180, zorder=5, marker='*', label=f'{outlier_pref}')
ax3.annotate(outlier_pref,
             (df_b.loc[outlier_idx, col3], df_b.loc[outlier_idx, OUTCOME]),
             textcoords='offset points', xytext=(5, 6),
             fontsize=9, color='#C62828', fontweight='bold')

x_min3 = df_b[col3].min() - 5000
x_max3 = df_b[col3].max() + 5000
x_rng3 = [x_min3, x_max3]

# 全体の回帰
s3a, i3a, *_ = stats.linregress(df_b[col3], df_b[OUTCOME])
ax3.plot(x_rng3, [s3a * x + i3a for x in x_rng3],
         'b--', linewidth=1.5, alpha=0.6, label=f'回帰（全体）r={r3a:.3f}, p={p3a:.3f}')

# 除外後の回帰
s3e, i3e, *_ = stats.linregress(df_excl[col3], df_excl[OUTCOME])
ax3.plot(x_rng3, [s3e * x + i3e for x in x_rng3],
         'g-', linewidth=2, alpha=0.85, label=f'回帰（{outlier_pref}除外）r={r3e:.3f}, p={p3e:.3f}')

ax3.set_xlabel(lab3, fontsize=11)
ax3.set_ylabel(OUTCOME_LABEL, fontsize=11)
ax3.set_title(f'消費支出と歯科診療所密度の関係\n（{outlier_pref}除外の影響比較）',
              fontsize=12, fontweight='bold')
ax3.legend(fontsize=9)
ax3.grid(True, alpha=0.3)
ax3.ticklabel_format(style='plain', axis='x')

plt.tight_layout()
out3 = os.path.join(FIG_DIR, '2024_H5_4_fig3_outlier.png')
fig3.savefig(out3, bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2024_H5_4_fig3_outlier.png")

# ────────────────────────────────────────────────────────────────
# 図4: 食料費割合 vs 歯科診療所密度（回帰直線付き）
# ────────────────────────────────────────────────────────────────
col4  = 'food_ratio'
lab4  = '食料費割合（%）'
r4,  p4  = stats.pearsonr(df_b[col4], df_b[OUTCOME])

fig4, ax4 = plt.subplots(figsize=(8, 6))
ax4.scatter(df_b[col4], df_b[OUTCOME],
            color='#6A1B9A', s=55, alpha=0.75, zorder=3, label='各都道府県')
ax4.scatter(df_b.loc[outlier_idx, col4], df_b.loc[outlier_idx, OUTCOME],
            color='#C62828', s=180, zorder=5, marker='*', label=f'{outlier_pref}')
ax4.annotate(outlier_pref,
             (df_b.loc[outlier_idx, col4], df_b.loc[outlier_idx, OUTCOME]),
             textcoords='offset points', xytext=(5, 5),
             fontsize=9, color='#C62828', fontweight='bold')

x_min4 = df_b[col4].min() - 0.3
x_max4 = df_b[col4].max() + 0.3
x_rng4 = [x_min4, x_max4]
s4, i4, *_ = stats.linregress(df_b[col4], df_b[OUTCOME])
ax4.plot(x_rng4, [s4 * x + i4 for x in x_rng4],
         color='#6A1B9A', linewidth=2, alpha=0.8,
         label=f'回帰直線 r={r4:.3f}, p={p4:.3f}')

ax4.set_xlabel(lab4, fontsize=11)
ax4.set_ylabel(OUTCOME_LABEL, fontsize=11)
ax4.set_title(f'食料費割合と歯科診療所密度の関係（N={N}都道府県、2022年度）\n'
              f'r={r4:.3f}, p={p4:.3f}', fontsize=12, fontweight='bold')
ax4.text(0.05, 0.93,
         '食料費割合が高い地域ほど\n歯科診療所密度の傾向を確認',
         transform=ax4.transAxes, fontsize=9, color='#444',
         bbox=dict(facecolor='#F3E5F5', edgecolor='#6A1B9A', boxstyle='round,pad=0.4'))
ax4.legend(fontsize=9)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
out4 = os.path.join(FIG_DIR, '2024_H5_4_fig4_income.png')
fig4.savefig(out4, bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2024_H5_4_fig4_income.png")

print("\n全図の生成完了（4枚）")