"""
2023_H2 分析スクリプト: 大腸がん罹患要因の探究と罹患しにくい生活の提案
==========================================================================
【論文】大腸がん罹患要因の探究と罹患しにくい生活の提案
        2023年度 統計データ分析コンペティション 優秀賞（高校生の部）

【概要】
  大腸がん罹患率のデータがSSDSE-Bに含まれないため、
  保健医療費割合（保健医療費 / 消費支出 × 100）を罹患負担プロキシとして使用。
  食生活・高齢化・経済環境・医療アクセスなどとの相関分析と外れ値分析を実施。

【手法】
  1. Pearson相関分析（全予測変数 vs 保健医療費割合）
  2. 散布図：高齢化率 vs 保健医療費割合（外れ値ハイライト + 回帰線2本）
  3. 散布図：食料費割合 vs 保健医療費割合
  4. 散布図：消費支出 vs 保健医療費割合（都道府県ラベル付き）

【入力】
  data/raw/SSDSE-B-2026.csv（2022年データ、47都道府県）

【出力】
  html/figures/2023_H2_fig1_corr.png    ... 相関係数棒グラフ（絶対値順）
  html/figures/2023_H2_fig2_aging.png   ... 高齢化率 vs 保健医療費割合
  html/figures/2023_H2_fig3_food.png    ... 食料費割合 vs 保健医療費割合
  html/figures/2023_H2_fig4_spend.png   ... 消費支出 vs 保健医療費割合

【データ出典】
  SSDSE-B-2026.csv: 社会・人口統計体系（都道府県データ）
  統計センター公表の実データ。合成データは一切使用しない。
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2023_H2_yushu.py
# ============================================================


import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import stats

warnings.filterwarnings('ignore')

# ── パス設定 ──────────────────────────────────────────────────────────
DATA_DIR = 'data/raw'
FIG_DIR  = 'html/figures'
os.makedirs(FIG_DIR, exist_ok=True)

plt.rcParams.update({
    'font.family': 'Hiragino Sans',
    'axes.unicode_minus': False,
    'figure.dpi': 150,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# ════════════════════════════════════════════════════════════════════
# ■ データ読み込み（実データのみ・合成データなし）
# ════════════════════════════════════════════════════════════════════
df_raw = pd.read_csv(
    os.path.join(DATA_DIR, 'SSDSE-B-2026.csv'),
    encoding='cp932', header=1
)

# 2022年度・都道府県コードのみ抽出（47都道府県）
df = df_raw[
    (df_raw['年度'] == 2022) &
    df_raw['地域コード'].str.match(r'^R\d{5}$', na=False)
].copy().reset_index(drop=True)

assert len(df) == 47, f"Expected 47 rows, got {len(df)}"
print(f"=== データ読み込み完了 ===")
print(f"  年度: 2022, 都道府県数: {len(df)}")

# ── 数値変換 ─────────────────────────────────────────────────────────
num_cols = [
    '総人口', '65歳以上人口', '合計特殊出生率',
    '消費支出（二人以上の世帯）', '食料費（二人以上の世帯）',
    '保健医療費（二人以上の世帯）', '教養娯楽費（二人以上の世帯）',
    '光熱・水道費（二人以上の世帯）', '被服及び履物費（二人以上の世帯）',
    '交通・通信費（二人以上の世帯）', '教育費（二人以上の世帯）',
    '歯科診療所数', '一般診療所数',
]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# ════════════════════════════════════════════════════════════════════
# ■ 変数エンジニアリング
# ════════════════════════════════════════════════════════════════════
pop     = df['総人口'].values.clip(1)
spend   = df['消費支出（二人以上の世帯）'].values.astype(float).clip(1)

# 目的変数：保健医療費割合（%）= 保健医療費 / 消費支出 × 100
health_share = df['保健医療費（二人以上の世帯）'].values / spend * 100

# 説明変数群
aging_rate   = df['65歳以上人口'].values / pop * 100          # 高齢化率（%）
food_share   = df['食料費（二人以上の世帯）'].values / spend * 100     # 食料費割合（%）
leisure_share= df['教養娯楽費（二人以上の世帯）'].values / spend * 100  # 教養娯楽費割合（%）
heat_share   = df['光熱・水道費（二人以上の世帯）'].values / spend * 100  # 光熱・水道費割合（%）
cloth_share  = df['被服及び履物費（二人以上の世帯）'].values / spend * 100 # 被服費割合（%）
transp_share = df['交通・通信費（二人以上の世帯）'].values / spend * 100  # 交通・通信費割合（%）
edu_share    = df['教育費（二人以上の世帯）'].values / spend * 100       # 教育費割合（%）
tfr          = df['合計特殊出生率'].values.astype(float)      # 合計特殊出生率
dental_density = df['歯科診療所数'].values / pop * 10000      # 歯科診療所密度（人口1万対）
clinic_density = df['一般診療所数'].values / pop * 10000      # 一般診療所密度（人口1万対）
log_spend    = np.log(spend)                                   # 消費支出（対数）

pref = df['都道府県'].values

# ════════════════════════════════════════════════════════════════════
# ■ Pearson相関分析（全予測変数 vs 保健医療費割合）
# ════════════════════════════════════════════════════════════════════
predictors = {
    '高齢化率 (%)':           aging_rate,
    '食料費割合 (%)':         food_share,
    '教養娯楽費割合 (%)':     leisure_share,
    '光熱・水道費割合 (%)':   heat_share,
    '被服費割合 (%)':         cloth_share,
    '交通・通信費割合 (%)':   transp_share,
    '教育費割合 (%)':         edu_share,
    '合計特殊出生率':         tfr,
    '歯科診療所密度（万対）': dental_density,
    '一般診療所密度（万対）': clinic_density,
    '消費支出（対数）':       log_spend,
}

print("\n=== Pearson 相関係数（vs 保健医療費割合） ===")
corr_results = {}
for name, vals in predictors.items():
    mask = ~(np.isnan(vals) | np.isnan(health_share))
    if mask.sum() < 5:
        continue
    r, p = stats.pearsonr(vals[mask], health_share[mask])
    corr_results[name] = {'r': r, 'p': p}
    sig = '**' if p < 0.01 else ('*' if p < 0.05 else 'ns')
    print(f"  {name:25s}  r={r:+.4f}  p={p:.4f}  {sig}")

# ════════════════════════════════════════════════════════════════════
# ■ 外れ値の特定（高齢化率 vs 保健医療費割合）
# ════════════════════════════════════════════════════════════════════
# 保健医療費割合で最高・最低の都道府県
idx_max = np.argmax(health_share)
idx_min = np.argmin(health_share)
outlier_idx = idx_max   # 最高値を外れ値として扱う
outlier_pref = pref[outlier_idx]

print(f"\n=== 保健医療費割合の外れ値 ===")
print(f"  最高: {pref[idx_max]} ({health_share[idx_max]:.2f}%)")
print(f"  最低: {pref[idx_min]} ({health_share[idx_min]:.2f}%)")
print(f"  全体平均: {health_share.mean():.2f}%  SD: {health_share.std():.2f}%")

# ════════════════════════════════════════════════════════════════════
# ■ Figure 1: 相関係数棒グラフ（絶対値順）
# ════════════════════════════════════════════════════════════════════
sorted_items = sorted(corr_results.items(), key=lambda x: abs(x[1]['r']), reverse=True)
names_sorted = [x[0] for x in sorted_items]
r_sorted     = [x[1]['r'] for x in sorted_items]
p_sorted     = [x[1]['p'] for x in sorted_items]
colors_bar   = ['#C62828' if p < 0.05 else '#90A4AE' for p in p_sorted]

fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.barh(range(len(names_sorted)), r_sorted,
               color=colors_bar, alpha=0.85, height=0.65)
ax.set_yticks(range(len(names_sorted)))
ax.set_yticklabels(names_sorted, fontsize=10)
ax.axvline(0, color='black', lw=1.0)
ax.set_xlabel('Pearson 相関係数 r', fontsize=11)
ax.set_title('各指標と保健医療費割合の相関係数\n（赤：p<0.05有意、灰：非有意）',
             fontsize=13, fontweight='bold')
ax.set_xlim(-1, 1)

for i, (bar, r, p) in enumerate(zip(bars, r_sorted, p_sorted)):
    xpos = bar.get_width() + (0.02 if r >= 0 else -0.02)
    ha   = 'left' if r >= 0 else 'right'
    label = f'r={r:.2f}{"*" if p < 0.05 else ""}'
    ax.text(xpos, i, label, va='center', fontsize=8.5, ha=ha)

import matplotlib.patches as mpatches
sig_patch = mpatches.Patch(color='#C62828', alpha=0.85, label='有意 (p<0.05)')
ns_patch  = mpatches.Patch(color='#90A4AE', alpha=0.85, label='非有意')
ax.legend(handles=[sig_patch, ns_patch], fontsize=9, loc='lower right')

plt.tight_layout()
fig1_path = os.path.join(FIG_DIR, '2023_H2_fig1_corr.png')
plt.savefig(fig1_path, bbox_inches='tight')
plt.close()
print(f"\nFigure 1 saved: {fig1_path}")

# ════════════════════════════════════════════════════════════════════
# ■ Figure 2: 高齢化率 vs 保健医療費割合（外れ値ハイライト + 回帰線2本）
# ════════════════════════════════════════════════════════════════════
x2 = aging_rate
y2 = health_share
mask_valid = ~(np.isnan(x2) | np.isnan(y2))
x2v, y2v = x2[mask_valid], y2[mask_valid]
pref_v = pref[mask_valid]

# 外れ値マスク（保健医療費割合が最高の都道府県）
is_outlier = (pref_v == outlier_pref)

# 全データ回帰
slope_all, intercept_all, r_all, p_all, _ = stats.linregress(x2v, y2v)
# 外れ値除外回帰
xno = x2v[~is_outlier]
yno = y2v[~is_outlier]
slope_no, intercept_no, r_no, p_no, _ = stats.linregress(xno, yno)

fig, ax = plt.subplots(figsize=(8, 6))

# 通常の点
ax.scatter(x2v[~is_outlier], y2v[~is_outlier],
           color='#1565C0', alpha=0.7, s=55, zorder=3, label='都道府県')

# 外れ値の点
ax.scatter(x2v[is_outlier], y2v[is_outlier],
           color='#C62828', s=120, zorder=5, marker='*',
           label=f'外れ値：{outlier_pref}')
for xi, yi, pi in zip(x2v[is_outlier], y2v[is_outlier], pref_v[is_outlier]):
    ax.annotate(pi, (xi, yi), textcoords='offset points', xytext=(6, 4),
                fontsize=9, color='#C62828', fontweight='bold')

# 回帰線（全データ）
x_line = np.linspace(x2v.min(), x2v.max(), 200)
ax.plot(x_line, slope_all * x_line + intercept_all,
        color='#1565C0', lw=2, ls='-',
        label=f'全都道府県 (r={r_all:+.3f}, p={p_all:.3f})')
# 回帰線（外れ値除外）
ax.plot(x_line, slope_no * x_line + intercept_no,
        color='#E65100', lw=2, ls='--',
        label=f'外れ値除外 (r={r_no:+.3f}, p={p_no:.3f})')

ax.set_xlabel('高齢化率 (%)', fontsize=12)
ax.set_ylabel('保健医療費割合 (%)', fontsize=12)
ax.set_title('高齢化率と保健医療費割合の関係\n（外れ値の影響を確認）',
             fontsize=13, fontweight='bold')
ax.legend(fontsize=9)

plt.tight_layout()
fig2_path = os.path.join(FIG_DIR, '2023_H2_fig2_aging.png')
plt.savefig(fig2_path, bbox_inches='tight')
plt.close()
print(f"Figure 2 saved: {fig2_path}")

print(f"  全データ:    r={r_all:+.4f}  p={p_all:.4f}")
print(f"  外れ値除外:  r={r_no:+.4f}  p={p_no:.4f}")

# ════════════════════════════════════════════════════════════════════
# ■ Figure 3: 食料費割合 vs 保健医療費割合
# ════════════════════════════════════════════════════════════════════
x3 = food_share
y3 = health_share
mask3 = ~(np.isnan(x3) | np.isnan(y3))
x3v, y3v, pref3 = x3[mask3], y3[mask3], pref[mask3]

slope3, intercept3, r3, p3, _ = stats.linregress(x3v, y3v)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x3v, y3v, color='#2E7D32', alpha=0.75, s=55, zorder=3)

# 注目点（最高保健医療費）
for xi, yi, pi in zip(x3v, y3v, pref3):
    if pi in [outlier_pref, pref[idx_min]]:
        ax.annotate(pi, (xi, yi), textcoords='offset points', xytext=(6, 3),
                    fontsize=8.5, color='#C62828', fontweight='bold')
        ax.scatter([xi], [yi], color='#C62828', s=80, zorder=5)

x_line3 = np.linspace(x3v.min(), x3v.max(), 200)
ax.plot(x_line3, slope3 * x_line3 + intercept3,
        color='#2E7D32', lw=2,
        label=f'回帰直線 (r={r3:+.3f}, p={p3:.3f})')

ax.set_xlabel('食料費割合 (%)', fontsize=12)
ax.set_ylabel('保健医療費割合 (%)', fontsize=12)
ax.set_title('食料費割合と保健医療費割合の関係\n（食生活と疾病負担の代理変数）',
             fontsize=13, fontweight='bold')
ax.legend(fontsize=10)

plt.tight_layout()
fig3_path = os.path.join(FIG_DIR, '2023_H2_fig3_food.png')
plt.savefig(fig3_path, bbox_inches='tight')
plt.close()
print(f"Figure 3 saved: {fig3_path}")
print(f"  食料費割合 vs 保健医療費割合: r={r3:+.4f}  p={p3:.4f}")

# ════════════════════════════════════════════════════════════════════
# ■ Figure 4: 消費支出 vs 保健医療費割合（都道府県ラベル付き）
# ════════════════════════════════════════════════════════════════════
x4 = spend / 1000   # 千円単位
y4 = health_share
mask4 = ~(np.isnan(x4) | np.isnan(y4))
x4v, y4v, pref4 = x4[mask4], y4[mask4], pref[mask4]

slope4, intercept4, r4, p4, _ = stats.linregress(x4v, y4v)

fig, ax = plt.subplots(figsize=(11, 7))
ax.scatter(x4v, y4v, color='#6A1B9A', alpha=0.7, s=50, zorder=3)

# 全都道府県ラベル（小さめ）
for xi, yi, pi in zip(x4v, y4v, pref4):
    short = pi.replace('都', '').replace('道', '').replace('府', '').replace('県', '')
    color = '#C62828' if pi == outlier_pref else '#333333'
    weight = 'bold' if pi == outlier_pref else 'normal'
    ax.text(xi, yi + 0.04, short, fontsize=6.5, ha='center',
            color=color, fontweight=weight)

x_line4 = np.linspace(x4v.min(), x4v.max(), 200)
ax.plot(x_line4, slope4 * x_line4 + intercept4,
        color='#6A1B9A', lw=2,
        label=f'回帰直線 (r={r4:+.3f}, p={p4:.3f})')

ax.set_xlabel('消費支出（千円/月・二人以上世帯）', fontsize=12)
ax.set_ylabel('保健医療費割合 (%)', fontsize=12)
ax.set_title('消費支出と保健医療費割合の関係（都道府県別）\n'
             '（赤字：外れ値都道府県）',
             fontsize=13, fontweight='bold')
ax.legend(fontsize=10)

plt.tight_layout()
fig4_path = os.path.join(FIG_DIR, '2023_H2_fig4_spend.png')
plt.savefig(fig4_path, bbox_inches='tight')
plt.close()
print(f"Figure 4 saved: {fig4_path}")
print(f"  消費支出 vs 保健医療費割合: r={r4:+.4f}  p={p4:.4f}")

# ════════════════════════════════════════════════════════════════════
# ■ 結果サマリー
# ════════════════════════════════════════════════════════════════════
print("\n=== 分析サマリー ===")
print(f"  目的変数: 保健医療費割合（保健医療費/消費支出×100）")
print(f"  データ: SSDSE-B-2026.csv 2022年度 47都道府県")
print(f"  外れ値（最高）: {pref[idx_max]}  {health_share[idx_max]:.2f}%")
print(f"  外れ値（最低）: {pref[idx_min]}  {health_share[idx_min]:.2f}%")
print(f"  平均: {health_share.mean():.2f}%  SD: {health_share.std():.2f}%")
print("\n上位相関変数（絶対値）:")
for name, res in sorted_items[:5]:
    sig = '有意' if res['p'] < 0.05 else '非有意'
    print(f"  {name:25s}  r={res['r']:+.4f}  p={res['p']:.4f}  ({sig})")
print("\n分析完了。html/figures/ に図を保存しました。")