"""
2020年度 統計データ分析コンペティション 審査員奨励賞（大学生・一般の部）
「食費支出と食生活・健康指標の地域格差分析」
手法: 相関分析、重回帰分析、地域比較
データ: SSDSE-B-2026.csv（2022年断面）
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2020_U5_4_shorei.py
# ============================================================


import os, numpy as np, pandas as pd
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

# ── 2022年断面データ ──
df22 = df_b[df_b['年度'] == 2022].copy().reset_index(drop=True)

# 派生変数の計算
df22['食料費割合'] = df22['食料費（二人以上の世帯）'] / df22['消費支出（二人以上の世帯）'] * 100
df22['高齢化率']   = df22['65歳以上人口'] / df22['総人口'] * 100
df22['保育所密度'] = df22['保育所等数'] / df22['総人口'] * 100000

# 地域区分マップ
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北', '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄'
}
region_colors = {'北海道・東北': '#4e9af1', '関東': '#e05c5c', '中部': '#f0a500',
                 '近畿': '#5cb85c', '中国・四国': '#9b59b6', '九州・沖縄': '#f39c12'}

df22['地域'] = df22['都道府県'].map(region_map)

# ════════════════════════════════════════════════════════════
# 図1: 都道府県別食料費ランキング（地域色分け）
# ════════════════════════════════════════════════════════════
df_rank = df22[['都道府県', '食料費（二人以上の世帯）', '地域']].copy()
df_rank = df_rank.sort_values('食料費（二人以上の世帯）', ascending=True).reset_index(drop=True)

fig1, ax1 = plt.subplots(figsize=(10, 12))
colors_bar = [region_colors[r] for r in df_rank['地域']]
bars = ax1.barh(df_rank['都道府県'], df_rank['食料費（二人以上の世帯）'],
                color=colors_bar, edgecolor='white', linewidth=0.5)

# 平均線
mean_food = df22['食料費（二人以上の世帯）'].mean()
ax1.axvline(mean_food, color='black', linestyle='--', linewidth=1.2, alpha=0.7)
ax1.text(mean_food + 200, 46, f'全国平均\n{mean_food:,.0f}円', fontsize=9, va='top', ha='left')

ax1.set_xlabel('食料費（円/月、二人以上の世帯）', fontsize=12)
ax1.set_title('都道府県別 食料費ランキング（2022年）', fontsize=14, fontweight='bold', pad=12)
ax1.set_xlim(60000, 95000)
ax1.tick_params(axis='y', labelsize=8.5)

# 凡例
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=v, label=k) for k, v in region_colors.items()]
ax1.legend(handles=legend_elements, loc='lower right', fontsize=9, framealpha=0.8)

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2020_U5_4_fig1.png'), bbox_inches='tight')
plt.close(fig1)
print('[OK] fig1 saved')

# ════════════════════════════════════════════════════════════
# 図2: 散布図 消費支出 vs 食料費（エンゲル係数的観点）
# ════════════════════════════════════════════════════════════
fig2, ax2 = plt.subplots(figsize=(9, 7))

for region, grp in df22.groupby('地域'):
    ax2.scatter(grp['消費支出（二人以上の世帯）'], grp['食料費（二人以上の世帯）'],
                color=region_colors[region], label=region, s=60, zorder=3, alpha=0.85)
    for _, row in grp.iterrows():
        ax2.annotate(row['都道府県'],
                     xy=(row['消費支出（二人以上の世帯）'], row['食料費（二人以上の世帯）']),
                     fontsize=6.5, alpha=0.8, ha='center', va='bottom',
                     xytext=(0, 4), textcoords='offset points')

# 回帰線
x_arr = df22['消費支出（二人以上の世帯）'].values
y_arr = df22['食料費（二人以上の世帯）'].values
slope, intercept, r_val, p_val, _ = stats.linregress(x_arr, y_arr)
x_line = np.linspace(x_arr.min(), x_arr.max(), 200)
ax2.plot(x_line, intercept + slope * x_line, 'k-', linewidth=1.5, alpha=0.7,
         label=f'回帰線 (r={r_val:.3f}, p<0.001)')

ax2.set_xlabel('消費支出合計（円/月）', fontsize=12)
ax2.set_ylabel('食料費（円/月）', fontsize=12)
ax2.set_title('消費支出と食料費の関係（2022年、47都道府県）\n─ エンゲル係数的観点から地域間格差を検討 ─',
              fontsize=13, fontweight='bold', pad=10)
ax2.legend(fontsize=9, loc='upper left', framealpha=0.85, ncol=2)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2020_U5_4_fig2.png'), bbox_inches='tight')
plt.close(fig2)
print('[OK] fig2 saved')

# ════════════════════════════════════════════════════════════
# 図3: 相関ヒートマップ（Pearson相関係数）
# ════════════════════════════════════════════════════════════
corr_cols_raw = [
    '食料費（二人以上の世帯）',
    '食料費割合',
    '消費支出（二人以上の世帯）',
    '保健医療費（二人以上の世帯）',
    '合計特殊出生率',
    '高齢化率',
]
corr_labels = ['食料費', '食料費割合\n(エンゲル係数)', '消費支出', '保健医療費', '合計特殊\n出生率', '高齢化率']

df_corr = df22[corr_cols_raw].copy()
df_corr.columns = corr_labels
corr_mat = df_corr.corr()

fig3, ax3 = plt.subplots(figsize=(7, 6))
im = ax3.imshow(corr_mat.values, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax3, fraction=0.046, pad=0.04)

n = len(corr_labels)
ax3.set_xticks(range(n))
ax3.set_yticks(range(n))
ax3.set_xticklabels(corr_labels, fontsize=9)
ax3.set_yticklabels(corr_labels, fontsize=9)

for i in range(n):
    for j in range(n):
        val = corr_mat.values[i, j]
        color = 'white' if abs(val) > 0.6 else 'black'
        ax3.text(j, i, f'{val:.2f}', ha='center', va='center',
                 fontsize=10, fontweight='bold', color=color)

ax3.set_title('食費・健康関連変数の相関行列（Pearson、2022年47都道府県）',
              fontsize=12, fontweight='bold', pad=12)
plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2020_U5_4_fig3.png'), bbox_inches='tight')
plt.close(fig3)
print('[OK] fig3 saved')

# ════════════════════════════════════════════════════════════
# 図4: 標準化偏回帰係数プロット（OLS推定結果）
# ════════════════════════════════════════════════════════════
y_reg = df22['食料費割合']
X_reg = df22[['消費支出（二人以上の世帯）', '高齢化率', '保育所密度', '合計特殊出生率']].copy()

X_std = (X_reg - X_reg.mean()) / X_reg.std()
X_std = sm.add_constant(X_std)
model = sm.OLS(y_reg, X_std).fit()

coef_names = ['消費支出', '高齢化率', '保育所密度', '合計特殊出生率']
coefs   = model.params.values[1:]
ci_low  = model.conf_int().values[1:, 0]
ci_high = model.conf_int().values[1:, 1]
pvals   = model.pvalues.values[1:]

# p値で色分け
bar_colors = ['#e05c5c' if p < 0.05 else '#aaaaaa' for p in pvals]

fig4, ax4 = plt.subplots(figsize=(8, 5))
y_pos = np.arange(len(coef_names))
ax4.barh(y_pos, coefs, color=bar_colors, height=0.5, zorder=3)
ax4.errorbar(coefs, y_pos,
             xerr=[coefs - ci_low, ci_high - coefs],
             fmt='none', color='black', capsize=4, linewidth=1.5, zorder=4)
ax4.axvline(0, color='black', linewidth=1.0)
ax4.set_yticks(y_pos)
ax4.set_yticklabels(coef_names, fontsize=11)
ax4.set_xlabel('標準化偏回帰係数（β）', fontsize=12)
ax4.set_title(f'食料費割合を目的変数とする重回帰分析\n標準化偏回帰係数（R²={model.rsquared:.3f}, adj.R²={model.rsquared_adj:.3f}）',
              fontsize=12, fontweight='bold', pad=10)
ax4.grid(True, axis='x', alpha=0.3)

for i, (c, p) in enumerate(zip(coefs, pvals)):
    sig_label = '**' if p < 0.01 else ('*' if p < 0.05 else 'n.s.')
    offset = 0.02 if c >= 0 else -0.02
    ha = 'left' if c >= 0 else 'right'
    ax4.text(c + offset, i, sig_label, va='center', ha=ha, fontsize=11, fontweight='bold',
             color='#cc0000' if p < 0.05 else '#888888')

from matplotlib.patches import Patch
legend_items = [Patch(facecolor='#e05c5c', label='p < 0.05（有意）'),
                Patch(facecolor='#aaaaaa', label='n.s.（非有意）')]
ax4.legend(handles=legend_items, loc='lower right', fontsize=9)

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2020_U5_4_fig4.png'), bbox_inches='tight')
plt.close(fig4)
print('[OK] fig4 saved')

# ════════════════════════════════════════════════════════════
# 統計値の出力
# ════════════════════════════════════════════════════════════
print('\n=== 基礎統計 ===')
print(f'全国平均食料費: {df22["食料費（二人以上の世帯）"].mean():,.0f}円/月')
print(f'全国平均食料費割合: {df22["食料費割合"].mean():.2f}%')
print(f'食料費 最大: {df22.loc[df22["食料費（二人以上の世帯）"].idxmax(), "都道府県"]} ({df22["食料費（二人以上の世帯）"].max():,.0f}円)')
print(f'食料費 最小: {df22.loc[df22["食料費（二人以上の世帯）"].idxmin(), "都道府県"]} ({df22["食料費（二人以上の世帯）"].min():,.0f}円)')
print(f'食料費 最大最小差: {df22["食料費（二人以上の世帯）"].max() - df22["食料費（二人以上の世帯）"].min():,.0f}円')
print(f'食料費割合 最大: {df22.loc[df22["食料費割合"].idxmax(), "都道府県"]} ({df22["食料費割合"].max():.1f}%)')
print(f'食料費割合 最小: {df22.loc[df22["食料費割合"].idxmin(), "都道府県"]} ({df22["食料費割合"].min():.1f}%)')

print('\n=== 相関係数 ===')
r_spend, p_spend = stats.pearsonr(df22['消費支出（二人以上の世帯）'], df22['食料費（二人以上の世帯）'])
r_health, p_health = stats.pearsonr(df22['食料費（二人以上の世帯）'], df22['保健医療費（二人以上の世帯）'])
r_tfr, p_tfr = stats.pearsonr(df22['食料費（二人以上の世帯）'], df22['合計特殊出生率'])
r_eng_tfr, p_eng_tfr = stats.pearsonr(df22['食料費割合'], df22['合計特殊出生率'])
print(f'消費支出 ↔ 食料費: r={r_spend:.3f}, p={p_spend:.4f}')
print(f'食料費 ↔ 保健医療費: r={r_health:.3f}, p={p_health:.4f}')
print(f'食料費 ↔ 合計特殊出生率: r={r_tfr:.3f}, p={p_tfr:.4f}')
print(f'食料費割合 ↔ 合計特殊出生率: r={r_eng_tfr:.3f}, p={p_eng_tfr:.4f}')

print('\n=== 重回帰分析 ===')
print(model.summary())

print('\n=== 地域別平均食料費 ===')
region_stats = df22.groupby('地域')['食料費（二人以上の世帯）'].agg(['mean','std','min','max'])
print(region_stats.round(0))