"""
教育用再現コード: 2023年 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：日本の食料自給率を上げるために

【分析概要】
  データ：SSDSE-B-2026.csv（都道府県データ）
          47都道府県 × 2012〜2023年（パネルデータ）

  分析1. 時系列分析（2012〜2023）
          全国平均の食料費・教育費・教養娯楽費の消費支出比率推移
  分析2. 地域間格差（2022年横断面分析）
          食料費割合（proxy指標）と地域特性の相関・重回帰分析
  分析3. 都道府県別 食料費割合ランキング

【目的変数】
  food_ratio = L322101（食料費） / L3221（消費支出） × 100（%）
  ※食料自給率の直接データはSSDSE-Bに含まれないため、
    「食料費が消費支出に占める割合」を食料依存度の代理変数として使用。
    食料費割合が高い地域 → 食料消費の相対的ウェイトが大きい

【説明変数（2022年横断面）】
  高齢化率     = A1303（65歳以上人口） / A1101（総人口） × 100
  消費支出     = L3221（消費支出・実数値、二人以上世帯）
  住宅地価格   = C5401（住宅地価格）
  気温         = B4101（年平均気温）
  降水量       = B4109（年間降水量）
  転出率       = A5102（転出者数） / A1101（総人口） × 100

【データ出典】
  SSDSE-B-2026.csv: 社会・人口統計体系 都道府県データ
  統計数理研究所 SSDSE（https://www.nstac.go.jp/use/literacy/ssdse/）

【データサイエンス学習ポイント】
  1. 代理変数（プロキシ変数）の考え方
  2. 時系列トレンドの可視化と解釈
  3. 横断面データの重回帰分析
  4. 地域統計データの特性（N=47の小サンプル）
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2023_H5_5_shorei.py
# ============================================================


import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

import os
FIG_DIR = 'html/figures'
DATA_DIR = 'data/raw'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ データ読み込み（SSDSE-B-2026: 都道府県データ）
# ================================================================
# header=0 → 1行目（コード名行）を列名として使用
df_raw = pd.read_csv(
    os.path.join(DATA_DIR, 'SSDSE-B-2026.csv'),
    encoding='cp932', header=0
)

# 都道府県コード（R01000〜R47000）の行のみ抽出
pref_mask = df_raw['Code'].str.match(r'^R\d{5}$', na=False)
df_pref = df_raw[pref_mask].copy().reset_index(drop=True)

year_col = df_pref.columns[0]  # 'SSDSE-B-2026'
df_pref[year_col] = pd.to_numeric(df_pref[year_col], errors='coerce')

# 数値変換
num_cols = ['L3221', 'L322101', 'L322108', 'L322109',
            'A1101', 'A1303', 'A5102',
            'L3221', 'C5401', 'B4101', 'B4109']
for c in num_cols:
    if c in df_pref.columns:
        df_pref[c] = pd.to_numeric(df_pref[c], errors='coerce')

print("=" * 60)
print("■ SSDSE-B 都道府県データ読み込み完了")
print(f"  全行数: {len(df_pref)}（47都道府県 × 12年）")
print(f"  年度: {sorted(df_pref[year_col].dropna().astype(int).unique())}")
print("=" * 60)

# ================================================================
# ■ 時系列分析（2012〜2023）
# ================================================================
print("\n■ 時系列分析：消費支出内訳の推移（全国平均）")

ts = df_pref.groupby(year_col)[['L3221', 'L322101', 'L322108', 'L322109']].mean()
ts['food_pct'] = ts['L322101'] / ts['L3221'] * 100    # 食料費割合（%）
ts['edu_pct']  = ts['L322108'] / ts['L3221'] * 100    # 教育費割合（%）
ts['ent_pct']  = ts['L322109'] / ts['L3221'] * 100    # 教養娯楽費割合（%）
ts = ts.sort_index()

print("\n  年度    食料費%   教育費%   教養娯楽費%")
print("  " + "-" * 38)
for yr, row in ts.iterrows():
    print(f"  {int(yr)}    {row['food_pct']:6.2f}    {row['edu_pct']:5.2f}    {row['ent_pct']:6.2f}")

# 時系列トレンド検定（Mann-Kendall 代替: ピアソン相関 vs 年）
years_arr = ts.index.values.astype(float)
r_food, p_food = stats.pearsonr(years_arr, ts['food_pct'].values)
r_edu,  p_edu  = stats.pearsonr(years_arr, ts['edu_pct'].values)
r_ent,  p_ent  = stats.pearsonr(years_arr, ts['ent_pct'].values)
print(f"\n  食料費割合 トレンド: r={r_food:.4f}, p={p_food:.4f}")
print(f"  教育費割合 トレンド: r={r_edu:.4f},  p={p_edu:.4f}")
print(f"  教養娯楽費 トレンド: r={r_ent:.4f},  p={p_ent:.4f}")

# ================================================================
# ■ 横断面分析（2022年 N=47都道府県）
# ================================================================
print("\n" + "=" * 60)
print("■ 横断面分析（2022年度）")
print("=" * 60)

df_2022 = df_pref[df_pref[year_col] == 2022].copy().reset_index(drop=True)
print(f"  対象: {len(df_2022)}都道府県（2022年度）")

# 目的変数：食料費割合（代理変数）
df_2022['food_ratio']   = df_2022['L322101'] / df_2022['L3221'] * 100
# 説明変数
df_2022['aging_rate']   = df_2022['A1303'] / df_2022['A1101'] * 100   # 高齢化率（%）
df_2022['outflow_rate'] = df_2022['A5102'] / df_2022['A1101'] * 100   # 転出率（%）

VAR_NAMES = ['高齢化率', '消費支出', '住宅地価格', '気温', '降水量', '転出率']
VAR_COLS  = ['aging_rate', 'L3221', 'C5401', 'B4101', 'B4109', 'outflow_rate']

y_vals = df_2022['food_ratio'].values
X_raw  = df_2022[VAR_COLS].values

# 欠損除外
valid = np.all(np.isfinite(X_raw), axis=1) & np.isfinite(y_vals)
X_raw  = X_raw[valid]
y_vals = y_vals[valid]
pref_names = df_2022.loc[valid, 'Prefecture'].values
N = len(y_vals)
print(f"  有効都道府県数: N={N}")

# 無相関検定
print("\n  無相関検定（ピアソン相関）")
print(f"  {'変数':<12} {'r':>8} {'p値':>10} {'有意':>6}")
print("  " + "-" * 40)
corrs_all = []
pvals_all = []
for name, col in zip(VAR_NAMES, X_raw.T):
    r, p = stats.pearsonr(col, y_vals)
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    print(f"  {name:<12} {r:>8.4f} {p:>10.4f} {sig:>6}")
    corrs_all.append(r)
    pvals_all.append(p)

# 重回帰分析
print("\n■ 重回帰分析（全変数投入 OLS）")
X_sm = sm.add_constant(X_raw)
model_full = sm.OLS(y_vals, X_sm).fit()
print(f"  R² = {model_full.rsquared:.4f}, 調整R² = {model_full.rsquared_adj:.4f}")
print(f"  F統計量 p = {model_full.f_pvalue:.4f}")
print(model_full.summary2())

# ================================================================
# ■ 図の生成（4枚）
# ================================================================

# ────────────────────────────────────────────────────────────────
# 図1 (fig1_ts): 時系列推移（食料費・教育費・教養娯楽費の消費支出比）
# ────────────────────────────────────────────────────────────────
fig1, ax1 = plt.subplots(figsize=(10, 5))

years_plot = ts.index.astype(int)
ax1.plot(years_plot, ts['food_pct'].values, 'o-',
         color='#C62828', linewidth=2.2, markersize=6,
         label='食料費割合（%）')
ax1.plot(years_plot, ts['edu_pct'].values, 's--',
         color='#1565C0', linewidth=1.8, markersize=5,
         label='教育費割合（%）')
ax1.plot(years_plot, ts['ent_pct'].values, '^:',
         color='#2E7D32', linewidth=1.8, markersize=5,
         label='教養娯楽費割合（%）')

# コロナ禍の影響を示す帯
ax1.axvspan(2020, 2021, alpha=0.08, color='gray', label='コロナ禍 (2020-2021)')

# 近似直線（食料費）
z1 = np.polyfit(years_arr, ts['food_pct'].values, 1)
ax1.plot(years_plot, np.poly1d(z1)(years_arr), '--',
         color='#C62828', alpha=0.45, linewidth=1.2)

ax1.set_xlabel('年度', fontsize=12)
ax1.set_ylabel('消費支出に占める割合（%）', fontsize=12)
ax1.set_title('消費支出内訳の時系列推移（2012〜2023年 全国都道府県平均）\n'
              '食料費割合の上昇傾向に注目',
              fontsize=12, fontweight='bold')
ax1.set_xticks(years_plot)
ax1.set_xticklabels([str(y) for y in years_plot], rotation=45, fontsize=9)
ax1.legend(fontsize=10, loc='upper left')
ax1.grid(True, alpha=0.35)
ax1.annotate(f'r={r_food:.3f}\n(p={p_food:.3f})',
             xy=(2020, ts.loc[2020, 'food_pct']),
             xytext=(2017.5, ts['food_pct'].max() - 0.3),
             fontsize=9, color='#C62828',
             arrowprops=dict(arrowstyle='->', color='#C62828', alpha=0.6))

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2023_H5_5_fig1_ts.png'), bbox_inches='tight', dpi=150)
plt.close(fig1)
print("\n図1保存: 2023_H5_5_fig1_ts.png")

# ────────────────────────────────────────────────────────────────
# 図2 (fig2_temp): 気温 vs 食料費割合（散布図）
# ────────────────────────────────────────────────────────────────
fig2, ax2 = plt.subplots(figsize=(9, 6))

temp_vals = X_raw[:, VAR_NAMES.index('気温')]
r_t, p_t = stats.pearsonr(temp_vals, y_vals)

ax2.scatter(temp_vals, y_vals, color='#1565C0', s=55, alpha=0.65, zorder=3, label='都道府県')

# 回帰直線
z2 = np.polyfit(temp_vals, y_vals, 1)
xs2 = np.linspace(temp_vals.min(), temp_vals.max(), 100)
ax2.plot(xs2, np.poly1d(z2)(xs2), 'r-', linewidth=1.8, alpha=0.75, label=f'近似直線 r={r_t:.3f}')

# 代表都道府県にラベル
highlight = {'北海道': '#C62828', '東京都': '#2E7D32', '沖縄県': '#FF8F00',
             '青森県': '#6A1B9A', '大阪府': '#1565C0'}
for i, pname in enumerate(pref_names):
    if pname in highlight:
        ax2.annotate(pname,
                     xy=(temp_vals[i], y_vals[i]),
                     xytext=(temp_vals[i] + 0.3, y_vals[i] + 0.05),
                     fontsize=9, color=highlight[pname],
                     arrowprops=dict(arrowstyle='->', color=highlight[pname], alpha=0.7))
        ax2.scatter([temp_vals[i]], [y_vals[i]],
                    color=highlight[pname], s=90, zorder=5)

ax2.set_xlabel('年平均気温（℃）', fontsize=12)
ax2.set_ylabel('食料費割合（食料費 / 消費支出 × 100, %）', fontsize=12)
ax2.set_title(f'気温と食料費割合の関係（2022年 N={N}都道府県）\n'
              f'気候条件と食消費の地域差',
              fontsize=12, fontweight='bold')
ax2.legend(fontsize=10)
ax2.grid(True, alpha=0.3)

sig_text = f'r={r_t:.4f},  p={p_t:.4f}'
sig_text += '  (***)' if p_t < 0.001 else '  (**)' if p_t < 0.01 else '  (*)' if p_t < 0.05 else '  (n.s.)'
ax2.text(0.98, 0.04, sig_text, transform=ax2.transAxes,
         ha='right', fontsize=9.5, color='#333',
         bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFF9C4', edgecolor='#F9A825', alpha=0.8))

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2023_H5_5_fig2_temp.png'), bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2023_H5_5_fig2_temp.png")

# ────────────────────────────────────────────────────────────────
# 図3 (fig3_land): 住宅地価格 vs 食料費割合（散布図）
# ────────────────────────────────────────────────────────────────
fig3, ax3 = plt.subplots(figsize=(9, 6))

land_vals = X_raw[:, VAR_NAMES.index('住宅地価格')]
r_l, p_l = stats.pearsonr(land_vals, y_vals)

ax3.scatter(land_vals / 1000, y_vals, color='#E65100', s=55, alpha=0.65, zorder=3, label='都道府県')

z3 = np.polyfit(land_vals, y_vals, 1)
xs3_raw = np.linspace(land_vals.min(), land_vals.max(), 100)
ax3.plot(xs3_raw / 1000, np.poly1d(z3)(xs3_raw), 'r-',
         linewidth=1.8, alpha=0.75, label=f'近似直線 r={r_l:.3f}')

# 代表都道府県ラベル
highlight2 = {'東京都': '#C62828', '大阪府': '#1565C0', '沖縄県': '#FF8F00',
              '北海道': '#2E7D32', '秋田県': '#6A1B9A'}
for i, pname in enumerate(pref_names):
    if pname in highlight2:
        ax3.annotate(pname,
                     xy=(land_vals[i] / 1000, y_vals[i]),
                     xytext=(land_vals[i] / 1000 + 5, y_vals[i] + 0.1),
                     fontsize=9, color=highlight2[pname],
                     arrowprops=dict(arrowstyle='->', color=highlight2[pname], alpha=0.7))
        ax3.scatter([land_vals[i] / 1000], [y_vals[i]],
                    color=highlight2[pname], s=90, zorder=5)

ax3.set_xlabel('住宅地平均価格（千円/m²）', fontsize=12)
ax3.set_ylabel('食料費割合（食料費 / 消費支出 × 100, %）', fontsize=12)
ax3.set_title(f'住宅地価格と食料費割合の関係（2022年 N={N}都道府県）\n'
              f'都市部・農村部での食消費パターンの違い',
              fontsize=12, fontweight='bold')
ax3.legend(fontsize=10)
ax3.grid(True, alpha=0.3)

sig_text3 = f'r={r_l:.4f},  p={p_l:.4f}'
sig_text3 += '  (***)' if p_l < 0.001 else '  (**)' if p_l < 0.01 else '  (*)' if p_l < 0.05 else '  (n.s.)'
ax3.text(0.98, 0.04, sig_text3, transform=ax3.transAxes,
         ha='right', fontsize=9.5, color='#333',
         bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFF9C4', edgecolor='#F9A825', alpha=0.8))

plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2023_H5_5_fig3_land.png'), bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2023_H5_5_fig3_land.png")

# ────────────────────────────────────────────────────────────────
# 図4 (fig4_rank): 都道府県別 食料費割合ランキング（横棒グラフ）
# ────────────────────────────────────────────────────────────────
fig4, ax4 = plt.subplots(figsize=(10, 8))

# 全47都道府県をソート、上位10件を強調
df_rank = pd.DataFrame({'Prefecture': pref_names, 'food_ratio': y_vals})
df_rank = df_rank.sort_values('food_ratio', ascending=True)  # 横棒グラフ用に昇順

# 全都道府県を薄く描き、上位10を強調
bar_colors = []
threshold_top10 = df_rank['food_ratio'].nlargest(10).min()
for val in df_rank['food_ratio']:
    if val >= threshold_top10:
        bar_colors.append('#C62828')   # 上位10: 赤
    else:
        bar_colors.append('#90CAF9')   # それ以外: 薄い青

bars = ax4.barh(df_rank['Prefecture'], df_rank['food_ratio'],
                color=bar_colors, alpha=0.8, edgecolor='white', height=0.7)

# 値ラベル（上位10のみ）
for bar, val, pname in zip(bars, df_rank['food_ratio'], df_rank['Prefecture']):
    if val >= threshold_top10:
        ax4.text(val + 0.05, bar.get_y() + bar.get_height() / 2,
                 f'{val:.1f}%', va='center', fontsize=8.5, fontweight='bold', color='#C62828')

# 全国平均線
national_mean = np.mean(y_vals)
ax4.axvline(national_mean, color='#FF8F00', linestyle='--', linewidth=1.8,
            label=f'全国平均 {national_mean:.1f}%', alpha=0.9)

ax4.set_xlabel('食料費割合（食料費 / 消費支出 × 100, %）', fontsize=11)
ax4.set_title('都道府県別 食料費割合（2022年）\n'
              '食料費割合が高い地域（赤）= 食料依存度が高い傾向',
              fontsize=12, fontweight='bold')
ax4.legend(fontsize=10)
ax4.grid(axis='x', alpha=0.3)
ax4.set_xlim(left=df_rank['food_ratio'].min() - 1)

# 凡例補足
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='#C62828', alpha=0.8, label=f'上位10都道府県（≥{threshold_top10:.1f}%）'),
    Patch(facecolor='#90CAF9', alpha=0.8, label='その他'),
]
ax4.legend(handles=legend_elements + [plt.Line2D([0], [0], color='#FF8F00',
           linestyle='--', linewidth=1.8, label=f'全国平均 {national_mean:.1f}%')],
           fontsize=9, loc='lower right')

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2023_H5_5_fig4_rank.png'), bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2023_H5_5_fig4_rank.png")

print("\n" + "=" * 60)
print("全図の生成完了（4枚）")
print(f"  fig1_ts.png   : 消費支出内訳 時系列推移（2012〜2023）")
print(f"  fig2_temp.png : 気温 vs 食料費割合 散布図")
print(f"  fig3_land.png : 住宅地価格 vs 食料費割合 散布図")
print(f"  fig4_rank.png : 都道府県別 食料費割合 ランキング")
print(f"  保存先: {os.path.abspath(FIG_DIR)}")
print("=" * 60)

# 代理変数の注意書き
print("\n【注意】本分析について")
print("  食料自給率（カロリーベース）は農林水産省が公表しており、")
print("  SSDSE-Bには含まれていない。")
print("  本分析では「食料費 / 消費支出」を代理変数（proxy）として使用。")
print("  この指標は家計の食料依存度を示すが、")
print("  食料自給率（国内生産量 / 消費量）とは概念が異なる点に注意。")