"""
教育用再現コード: 2022年 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：気象条件と地域経済：降水量・気温が消費行動に与える影響
受賞：審査員奨励賞（高校生の部）

【分析概要】
  データ：SSDSE-B-2026.csv（都道府県別データ）
  目的変数：消費支出（二人以上の世帯）、食料費（二人以上の世帯）
  説明変数：年平均気温、降水量（年間）、最高気温、降水日数
  制御変数：高齢化率、総人口

  Step1. 気温の都道府県別分布（ランキング棒グラフ）
  Step2. 気温 vs 消費支出 散布図（2022年、地域別色分け）
  Step3. OLS重回帰：消費支出の気象要因（回帰係数プロット）
  Step4. 降水量 vs 食料費 散布図（2022年）

【変数】
  説明変数（気象）:
    年平均気温（℃）
    最高気温（日最高気温の月平均の最高値）（℃）
    降水日数（年間）（日）
    降水量（年間）（mm）
  目的変数:
    消費支出（二人以上の世帯）（円/月）
    食料費（二人以上の世帯）（円/月）
  制御変数:
    高齢化率（65歳以上人口 / 総人口 × 100）
    総人口（万人）

【データ出典】
  SSDSE-B-2026.csv: 社会・人口統計体系（都道府県データ）
  統計センターより公表の実データ

【データサイエンス学習ポイント】
  1. 気象データの統計的分析（外生性と気象変数の活用法）
  2. 単回帰 vs 重回帰（交絡変数の制御の必要性）
  3. 残差プロットによる仮定の検証
  4. 気象と消費の経済理論的背景
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2022_H5_3_shorei.py
# ============================================================


import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

import os
FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ 実データ読み込み（SSDSE-B-2026: 都道府県データ）
# ================================================================
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}$', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

# 2022年度データを抽出
df = df_b[df_b['年度'] == 2022].copy().reset_index(drop=True)
print("=" * 65)
print(f"■ 対象都道府県数: N={len(df)}（2022年度 SSDSE-B）")
print("=" * 65)

# ── 変数生成 ─────────────────────────────────────────────────────
num_cols = [
    '年平均気温',
    '最高気温（日最高気温の月平均の最高値）',
    '降水日数（年間）',
    '降水量（年間）',
    '消費支出（二人以上の世帯）',
    '食料費（二人以上の世帯）',
    '総人口',
    '65歳以上人口',
]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

df['高齢化率'] = df['65歳以上人口'] / df['総人口'] * 100
df['総人口_万人'] = df['総人口'] / 10000
df['消費支出_万円'] = df['消費支出（二人以上の世帯）'] / 10000
df['食料費_万円'] = df['食料費（二人以上の世帯）'] / 10000

# 地域区分マップ
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北',
    '茨城県': '関東', '栃木県': '関東', '群馬県': '関東', '埼玉県': '関東',
    '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄',
}
region_colors = {
    '北海道・東北': '#4e9af1',
    '関東':        '#e05c5c',
    '中部':        '#f0a500',
    '近畿':        '#5cb85c',
    '中国・四国':  '#9b59b6',
    '九州・沖縄':  '#f39c12',
}
df['地域'] = df['都道府県'].map(region_map)

# 欠損除外
df_valid = df.dropna(subset=[
    '年平均気温', '最高気温（日最高気温の月平均の最高値）',
    '降水日数（年間）', '降水量（年間）',
    '消費支出（二人以上の世帯）', '食料費（二人以上の世帯）',
    '高齢化率', '総人口_万人'
]).copy().reset_index(drop=True)

N = len(df_valid)
print(f"有効都道府県数（欠損除外後）: N={N}")

# ── 基本統計量 ──────────────────────────────────────────────────
print()
print(df_valid[['年平均気温', '降水量（年間）', '消費支出_万円', '食料費_万円',
                '高齢化率', '総人口_万人']].describe().round(2))

# ── 相関分析 ───────────────────────────────────────────────────
print("\n" + "=" * 65)
print("■ 相関分析（気象変数 × 消費支出・食料費）")
print("=" * 65)
weather_vars = ['年平均気温', '最高気温（日最高気温の月平均の最高値）',
                '降水日数（年間）', '降水量（年間）']
target_vars = ['消費支出_万円', '食料費_万円']
print(f"\n  {'変数':<30} {'vs 消費支出':>12} {'vs 食料費':>12}")
print("  " + "-" * 58)
for wv in weather_vars:
    r1, p1 = stats.pearsonr(df_valid[wv], df_valid['消費支出_万円'])
    r2, p2 = stats.pearsonr(df_valid[wv], df_valid['食料費_万円'])
    s1 = '***' if p1 < 0.001 else '**' if p1 < 0.01 else '*' if p1 < 0.05 else 'n.s.'
    s2 = '***' if p2 < 0.001 else '**' if p2 < 0.01 else '*' if p2 < 0.05 else 'n.s.'
    print(f"  {wv:<30} r={r1:+.3f}{s1:>4}  r={r2:+.3f}{s2:>4}")

# ── OLS重回帰分析（消費支出） ─────────────────────────────────
print("\n" + "=" * 65)
print("■ OLS重回帰分析（目的変数: 消費支出）")
print("=" * 65)
xvars_names = ['年平均気温', '降水量（年間）', '高齢化率', '総人口_万人']
df_reg = df_valid[['消費支出_万円'] + xvars_names].dropna()
X_reg = sm.add_constant(df_reg[xvars_names])
res_ols = sm.OLS(df_reg['消費支出_万円'], X_reg).fit()
print(res_ols.summary())

# ================================================================
# ■ 図の生成（4枚）
# ================================================================

# ────────────────────────────────────────────────────────────────
# 図1: 年平均気温の都道府県別ランキング棒グラフ
# ────────────────────────────────────────────────────────────────
df_rank = df_valid[['都道府県', '年平均気温', '地域']].sort_values(
    '年平均気温', ascending=True).reset_index(drop=True)

fig1, ax1 = plt.subplots(figsize=(10, 10))
bar_colors = [region_colors.get(df_rank.loc[i, '地域'], '#888888') for i in df_rank.index]
bars = ax1.barh(range(len(df_rank)), df_rank['年平均気温'],
                color=bar_colors, alpha=0.85, edgecolor='white', height=0.7)
ax1.set_yticks(range(len(df_rank)))
ax1.set_yticklabels(df_rank['都道府県'], fontsize=9)
ax1.set_xlabel('年平均気温（℃）', fontsize=12)
ax1.set_title('年平均気温の都道府県別ランキング（2022年）\n'
              '（出所：SSDSE-B-2026、気象庁）', fontsize=13, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)
# 数値ラベル
for i, (idx, row) in enumerate(df_rank.iterrows()):
    ax1.text(row['年平均気温'] + 0.1, i, f"{row['年平均気温']:.1f}℃",
             va='center', ha='left', fontsize=7.5)
# 凡例
from matplotlib.patches import Patch
legend_handles = [Patch(color=c, alpha=0.85, label=r)
                  for r, c in region_colors.items()]
ax1.legend(handles=legend_handles, fontsize=8, loc='lower right',
           title='地域', title_fontsize=9)
plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2022_H5_3_fig1_temp_rank.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig1)
print("\n図1保存: 2022_H5_3_fig1_temp_rank.png")

# ────────────────────────────────────────────────────────────────
# 図2: 気温 vs 消費支出 散布図（2022年、地域別色分け）
# ────────────────────────────────────────────────────────────────
fig2, ax2 = plt.subplots(figsize=(9, 6))
for reg, grp in df_valid.groupby('地域'):
    ax2.scatter(grp['年平均気温'], grp['消費支出_万円'],
                color=region_colors.get(reg, '#888888'),
                label=reg, s=70, alpha=0.85, zorder=3)
# 都道府県名ラベル
for _, row in df_valid.iterrows():
    ax2.annotate(row['都道府県'][:2],
                 (row['年平均気温'], row['消費支出_万円']),
                 fontsize=6.5, alpha=0.75,
                 xytext=(3, 3), textcoords='offset points')
# 回帰直線
x2 = df_valid['年平均気温'].values
y2 = df_valid['消費支出_万円'].values
slope2, intercept2, r2, p2, _ = stats.linregress(x2, y2)
xr2 = np.linspace(x2.min(), x2.max(), 100)
ax2.plot(xr2, slope2 * xr2 + intercept2, 'k--', linewidth=1.5,
         label=f'回帰直線 (r={r2:.3f}, p={p2:.3f})')
ax2.set_xlabel('年平均気温（℃）', fontsize=12)
ax2.set_ylabel('消費支出（万円/月）', fontsize=12)
ax2.set_title('年平均気温 vs 消費支出（二人以上の世帯, 2022年）\n'
              '（地域別色分け、都道府県ラベル付き）',
              fontsize=12, fontweight='bold')
ax2.legend(fontsize=8, ncol=2, loc='upper right')
ax2.grid(alpha=0.3)
plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2022_H5_3_fig2_temp_consumption.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2022_H5_3_fig2_temp_consumption.png")

# ────────────────────────────────────────────────────────────────
# 図3: OLS回帰係数プロット（消費支出の気象要因）
# ────────────────────────────────────────────────────────────────
# 標準化係数（比較のため）
df_std = df_reg.copy()
for col in df_std.columns:
    if df_std[col].std() > 0:
        df_std[col] = (df_std[col] - df_std[col].mean()) / df_std[col].std()
X_std = sm.add_constant(df_std[xvars_names])
res_std = sm.OLS(df_std['消費支出_万円'], X_std).fit()

# プロット用ラベル
label_map = {
    '年平均気温':     '年平均気温（℃）',
    '降水量（年間）': '降水量・年間（mm）',
    '高齢化率':       '高齢化率（%）',
    '総人口_万人':    '総人口（万人）',
}
coefs_std = res_std.params.drop('const')
ses_std   = res_std.bse.drop('const')
pvals_std = res_std.pvalues.drop('const')
labels_std = [label_map.get(c, c) for c in coefs_std.index]
colors_std = ['#C62828' if p < 0.01 else '#FF8F00' if p < 0.05 else '#9E9E9E'
              for p in pvals_std]

fig3, ax3 = plt.subplots(figsize=(9, 5))
y_pos = np.arange(len(coefs_std))
ax3.barh(y_pos, coefs_std.values, color=colors_std, alpha=0.80,
         edgecolor='white', height=0.6)
ax3.errorbar(coefs_std.values, y_pos, xerr=1.96 * ses_std.values,
             fmt='none', color='#333', capsize=4, linewidth=1.2)
ax3.axvline(0, color='gray', linestyle='--', linewidth=1.0)
ax3.set_yticks(y_pos)
ax3.set_yticklabels(labels_std, fontsize=11)
ax3.set_xlabel('標準化回帰係数（±1.96SE）', fontsize=11)
ax3.set_title(f'消費支出の気象・社会要因 OLS重回帰（標準化係数）\n'
              f'R²={res_std.rsquared:.3f}, N={N}都道府県（2022年）',
              fontsize=12, fontweight='bold')
ax3.invert_yaxis()
ax3.grid(axis='x', alpha=0.3)
# p値の星マーク
for i, (coef, se, p) in enumerate(zip(coefs_std.values, ses_std.values, pvals_std)):
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
    xpos = coef + 1.96 * se + 0.02
    ax3.text(xpos, i, sig, va='center', ha='left', fontsize=11, color='#C62828')
legend_handles = [
    Patch(color='#C62828', alpha=0.80, label='p < 0.01'),
    Patch(color='#FF8F00', alpha=0.80, label='p < 0.05'),
    Patch(color='#9E9E9E', alpha=0.80, label='n.s.'),
]
ax3.legend(handles=legend_handles, fontsize=9, loc='lower right')
plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2022_H5_3_fig3_ols_coef.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2022_H5_3_fig3_ols_coef.png")

# ────────────────────────────────────────────────────────────────
# 図4: 降水量 vs 食料費 散布図（2022年）
# ────────────────────────────────────────────────────────────────
fig4, ax4 = plt.subplots(figsize=(9, 6))
for reg, grp in df_valid.groupby('地域'):
    ax4.scatter(grp['降水量（年間）'], grp['食料費_万円'],
                color=region_colors.get(reg, '#888888'),
                label=reg, s=70, alpha=0.85, zorder=3)
for _, row in df_valid.iterrows():
    ax4.annotate(row['都道府県'][:2],
                 (row['降水量（年間）'], row['食料費_万円']),
                 fontsize=6.5, alpha=0.75,
                 xytext=(3, 3), textcoords='offset points')
x4 = df_valid['降水量（年間）'].values
y4 = df_valid['食料費_万円'].values
slope4, intercept4, r4, p4, _ = stats.linregress(x4, y4)
xr4 = np.linspace(x4.min(), x4.max(), 100)
ax4.plot(xr4, slope4 * xr4 + intercept4, 'k--', linewidth=1.5,
         label=f'回帰直線 (r={r4:.3f}, p={p4:.3f})')
ax4.set_xlabel('降水量・年間（mm）', fontsize=12)
ax4.set_ylabel('食料費（万円/月）', fontsize=12)
ax4.set_title('降水量（年間）vs 食料費（二人以上の世帯, 2022年）\n'
              '（地域別色分け）',
              fontsize=12, fontweight='bold')
ax4.legend(fontsize=8, ncol=2, loc='upper right')
ax4.grid(alpha=0.3)
plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2022_H5_3_fig4_precip_food.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2022_H5_3_fig4_precip_food.png")

print("\n全図の生成完了（4枚）")
print("  fig1: 2022_H5_3_fig1_temp_rank.png       — 年平均気温ランキング棒グラフ")
print("  fig2: 2022_H5_3_fig2_temp_consumption.png — 気温 vs 消費支出 散布図")
print("  fig3: 2022_H5_3_fig3_ols_coef.png         — OLS回帰係数プロット")
print("  fig4: 2022_H5_3_fig4_precip_food.png      — 降水量 vs 食料費 散布図")