"""
教育用再現コード: 2023年 統計データ分析コンペティション 審査員奨励賞 [高校生の部]
=================================================================
論文タイトル：海水温からの降水量予測を目指して

【分析概要】
  海水温データはSSDSE-Bに収録されていないため、年平均気温（B4101）を
  海水温の代理変数（proxy）として用いる。
  都道府県別の気温データから降水量を予測する時系列回帰・散布図分析を行う。

  データ：SSDSE-B-2026.csv（都道府県データ）
  目的変数：年降水量（B4109）
  説明変数：年平均気温（B4101）、最高気温（B4102）、最低気温（B4103）
            気温較差 = B4102 − B4103（気象変動性の代理変数）
  補助変数：降水日数（B4106）

  Step1. 断面分析：47都道府県 × 2022年 — 気温 vs 降水量 散布図
  Step2. 時系列分析：2012-2023年 全国平均気温・降水量の推移（2軸）
  Step3. パネル散布図：複数年を色分けで重ね合わせ
  Step4. パネル回帰：各年の β(気温→降水量) 係数の時系列

【変数コード（SSDSE-B）】
  B4101: 年平均気温（℃）
  B4102: 最高気温（日最高気温の月平均の最高値）（℃）
  B4103: 最低気温（日最低気温の月平均の最低値）（℃）
  B4106: 降水日数（年間）
  B4109: 年降水量（mm）

【データ出典】
  SSDSE-B-2026.csv: 社会・人口統計体系（都道府県データ）
  統計センターより公表の実データ
  https://www.nstac.go.jp/use/literacy/ssdse/

【データサイエンス学習ポイント】
  1. 代理変数（proxy variable）の考え方：観測できない変数の代替
  2. 断面データ vs パネルデータの分析の違い
  3. 2軸グラフ（twin axes）による時系列の可視化
  4. 回帰係数の安定性：年ごとのβの変動を観察する
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2023_H5_4_shorei.py
# ============================================================


import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import statsmodels.api as sm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

import os
FIG_DIR = 'html/figures'
DATA_DIR = 'data/raw'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ 実データ読み込み（SSDSE-B-2026: 都道府県データ）
# ================================================================
df_raw = pd.read_csv(
    os.path.join(DATA_DIR, 'SSDSE-B-2026.csv'),
    header=0,          # 行0がコード名（B4101等）
    encoding='cp932'
)

# 都道府県行を抽出（地域コードが R##### 形式）
mask_pref = df_raw['Code'].astype(str).str.match(r'^R\d{5}$', na=False)
df_b = df_raw[mask_pref].copy()

# 列名を分かりやすくリネーム
df_b = df_b.rename(columns={
    'SSDSE-B-2026': '年度',
    'Code':         '地域コード',
    'Prefecture':   '都道府県',
})

# 必要列を数値変換
needed_cols = ['B4101', 'B4102', 'B4103', 'B4106', 'B4109']
for c in needed_cols:
    df_b[c] = pd.to_numeric(df_b[c], errors='coerce')
df_b['年度'] = pd.to_numeric(df_b['年度'], errors='coerce')

# 気温較差（最高 − 最低）を計算
df_b['気温較差'] = df_b['B4102'] - df_b['B4103']

print("=" * 60)
print("■ SSDSE-B-2026 読み込み完了")
print(f"  全行数: {len(df_b)}, 年度: {sorted(df_b['年度'].dropna().unique().astype(int))}")
print(f"  2022年度の都道府県数: {(df_b['年度'] == 2022).sum()}")
print("=" * 60)

# ================================================================
# ■ 断面データ：2022年47都道府県
# ================================================================
df_2022 = df_b[df_b['年度'] == 2022][
    ['都道府県', '地域コード', 'B4101', 'B4102', 'B4103', '気温較差', 'B4106', 'B4109']
].dropna().reset_index(drop=True)

print(f"\n■ 2022年 断面データ")
print(f"  都道府県数: {len(df_2022)}")
print(df_2022[['都道府県', 'B4101', 'B4109']].describe().round(2))

# ピアソン相関
r_cs, p_cs = stats.pearsonr(df_2022['B4101'], df_2022['B4109'])
r_range, p_range = stats.pearsonr(df_2022['気温較差'], df_2022['B4109'])
print(f"\n  断面相関: 年平均気温 vs 年降水量 r={r_cs:.4f}, p={p_cs:.4f}")
print(f"  断面相関: 気温較差   vs 年降水量 r={r_range:.4f}, p={p_range:.4f}")

# OLS（断面）：降水量 ~ 気温 + 気温較差
X_cs = sm.add_constant(df_2022[['B4101', '気温較差']])
model_cs = sm.OLS(df_2022['B4109'], X_cs).fit()
print(f"\n  OLS（断面）: R²={model_cs.rsquared:.4f}, N={len(df_2022)}")
print(model_cs.summary2())

# ================================================================
# ■ パネルデータ：全年度
# ================================================================
df_all = df_b[['年度', '都道府県', '地域コード', 'B4101', 'B4109', '気温較差']].dropna().copy()
df_all = df_all[df_all['年度'].between(2012, 2023)]

print("\n■ パネルデータ（2012-2023年）")
print(f"  行数: {len(df_all)}")

# 全国平均（年ごと）
ts_mean = df_all.groupby('年度')[['B4101', 'B4109']].mean().reset_index()
ts_mean.columns = ['年度', '平均気温', '平均降水量']
ts_mean = ts_mean.sort_values('年度')
print("\n  年別全国平均:")
print(ts_mean.to_string(index=False))

# パネル回帰：各年に OLS → β(気温→降水量) を計算
years_for_reg = sorted(df_all['年度'].unique())
panel_results = []
for yr in years_for_reg:
    sub = df_all[df_all['年度'] == yr][['B4101', '気温較差', 'B4109']].dropna()
    if len(sub) < 10:
        continue
    X_yr = sm.add_constant(sub[['B4101', '気温較差']])
    m = sm.OLS(sub['B4109'], X_yr).fit()
    panel_results.append({
        '年度':      int(yr),
        'β_気温':    m.params['B4101'],
        'se_気温':   m.bse['B4101'],
        'p_気温':    m.pvalues['B4101'],
        'β_較差':    m.params['気温較差'],
        'se_較差':   m.bse['気温較差'],
        'R²':        m.rsquared,
        'N':         len(sub),
    })

df_panel = pd.DataFrame(panel_results)
print("\n■ 年別OLS回帰結果:")
print(df_panel[['年度', 'β_気温', 'se_気温', 'p_気温', 'R²', 'N']].to_string(index=False))

# ================================================================
# ■ 図の生成（4枚）
# ================================================================

YEARS_ALL = sorted(df_all['年度'].unique())
N_YEARS = len(YEARS_ALL)
cmap_year = cm.get_cmap('tab20', N_YEARS)

# ────────────────────────────────────────────────────────────────
# 図1: 散布図（2022年 47都道府県）年平均気温 vs 年降水量
# ────────────────────────────────────────────────────────────────
fig1, ax1 = plt.subplots(figsize=(9, 6))

scatter_colors = ['#1565C0'] * len(df_2022)

ax1.scatter(df_2022['B4101'], df_2022['B4109'],
            color='#1565C0', s=60, alpha=0.75, zorder=3, label='都道府県（2022年）')

# 回帰直線（単回帰）
x_cs = df_2022['B4101'].values
y_cs = df_2022['B4109'].values
z_cs = np.polyfit(x_cs, y_cs, 1)
xs_cs = np.linspace(x_cs.min(), x_cs.max(), 200)
ax1.plot(xs_cs, np.poly1d(z_cs)(xs_cs), 'r-', linewidth=2, alpha=0.8,
         label=f'回帰直線（r={r_cs:.3f}, p={p_cs:.3f}）')

# 都道府県ラベル（上位・下位5件）
top5_rain = df_2022.nlargest(3, 'B4109')
bot5_rain = df_2022.nsmallest(3, 'B4109')
for _, row in pd.concat([top5_rain, bot5_rain]).iterrows():
    ax1.annotate(
        row['都道府県'],
        xy=(row['B4101'], row['B4109']),
        xytext=(5, 5), textcoords='offset points',
        fontsize=8, color='#333'
    )

ax1.set_xlabel('年平均気温（℃）[B4101]', fontsize=12)
ax1.set_ylabel('年降水量（mm）[B4109]', fontsize=12)
ax1.set_title('年平均気温と年降水量の関係\n（2022年 47都道府県 SSDSE-B実データ）',
              fontsize=13, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(True, alpha=0.3)

# 相関・回帰情報を枠内に表示
textstr = (f'r = {r_cs:.3f}\n'
           f'p = {p_cs:.4f}\n'
           f'N = {len(df_2022)}都道府県')
ax1.text(0.03, 0.97, textstr, transform=ax1.transAxes,
         fontsize=10, verticalalignment='top',
         bbox=dict(boxstyle='round,pad=0.4', facecolor='#E3F2FD', alpha=0.8))

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2023_H5_4_fig1_scatter2022.png'), bbox_inches='tight', dpi=150)
plt.close(fig1)
print("\n図1保存: 2023_H5_4_fig1_scatter2022.png")

# ────────────────────────────────────────────────────────────────
# 図2: 時系列（2012-2023年）全国平均気温・降水量 — 2軸グラフ
# ────────────────────────────────────────────────────────────────
fig2, ax2a = plt.subplots(figsize=(10, 5))
ax2b = ax2a.twinx()

years_ts = ts_mean['年度'].values
temp_ts  = ts_mean['平均気温'].values
rain_ts  = ts_mean['平均降水量'].values

line_temp = ax2a.plot(years_ts, temp_ts, 'o-', color='#C62828', linewidth=2,
                      markersize=7, label='年平均気温（℃）', zorder=3)
line_rain = ax2b.plot(years_ts, rain_ts, 's--', color='#1565C0', linewidth=2,
                      markersize=7, label='年降水量（mm）', zorder=3)

ax2a.set_xlabel('年度', fontsize=12)
ax2a.set_ylabel('年平均気温（℃）', fontsize=12, color='#C62828')
ax2b.set_ylabel('年降水量（mm）', fontsize=12, color='#1565C0')
ax2a.tick_params(axis='y', labelcolor='#C62828')
ax2b.tick_params(axis='y', labelcolor='#1565C0')
ax2a.set_xticks(years_ts)
ax2a.set_xticklabels([str(int(y)) for y in years_ts], rotation=45, ha='right')

# 凡例を統合
lines = line_temp + line_rain
labels = [l.get_label() for l in lines]
ax2a.legend(lines, labels, loc='upper left', fontsize=10)

ax2a.set_title('全国平均気温と全国平均年降水量の時系列推移（2012-2023年）\n'
               '（47都道府県平均 SSDSE-B実データ）',
               fontsize=12, fontweight='bold')
ax2a.grid(True, alpha=0.3)

# ピアソン相関注記
r_ts, p_ts = stats.pearsonr(temp_ts, rain_ts)
ax2a.text(0.97, 0.05,
          f'気温×降水量 相関: r={r_ts:.3f} (p={p_ts:.3f})',
          transform=ax2a.transAxes, fontsize=9, ha='right',
          bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFF9C4', alpha=0.8))

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2023_H5_4_fig2_timeseries.png'), bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2023_H5_4_fig2_timeseries.png")

# ────────────────────────────────────────────────────────────────
# 図3: パネル散布図 — 複数年 気温 vs 降水量（年ごとに色分け）
# ────────────────────────────────────────────────────────────────
fig3, ax3 = plt.subplots(figsize=(10, 7))

for i, yr in enumerate(YEARS_ALL):
    sub = df_all[df_all['年度'] == yr][['B4101', 'B4109']].dropna()
    c = cmap_year(i)
    ax3.scatter(sub['B4101'], sub['B4109'],
                color=c, s=30, alpha=0.65, zorder=2, label=str(int(yr)))

# 全データの回帰直線
x_all = df_all['B4101'].values
y_all = df_all['B4109'].values
z_all = np.polyfit(x_all, y_all, 1)
xs_all = np.linspace(x_all.min(), x_all.max(), 200)
ax3.plot(xs_all, np.poly1d(z_all)(xs_all),
         'k-', linewidth=2.5, alpha=0.7, zorder=4, label='全年度回帰直線')

r_all, p_all = stats.pearsonr(x_all, y_all)

ax3.set_xlabel('年平均気温（℃）[B4101]', fontsize=12)
ax3.set_ylabel('年降水量（mm）[B4109]', fontsize=12)
ax3.set_title('年平均気温と年降水量（パネル散布図 2012-2023年）\n'
              '（色 = 年度、各点 = 都道府県）',
              fontsize=12, fontweight='bold')

ax3.legend(loc='upper left', fontsize=8, ncol=3,
           title='年度', title_fontsize=8)
ax3.grid(True, alpha=0.3)

textstr3 = (f'全データ: r={r_all:.3f}, p<{max(p_all, 1e-4):.4f}\n'
            f'N={len(df_all)}（都道府県×年）')
ax3.text(0.97, 0.97, textstr3, transform=ax3.transAxes,
         fontsize=10, va='top', ha='right',
         bbox=dict(boxstyle='round,pad=0.4', facecolor='#E8F5E9', alpha=0.85))

plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2023_H5_4_fig3_panel_scatter.png'), bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2023_H5_4_fig3_panel_scatter.png")

# ────────────────────────────────────────────────────────────────
# 図4: 回帰係数時系列 — β(気温→降水量) の年ごとの変動
# ────────────────────────────────────────────────────────────────
fig4, axes4 = plt.subplots(1, 2, figsize=(13, 5))
fig4.suptitle('年別OLS回帰：β(年平均気温→年降水量) の変動\n'
              '（各年 47都道府県クロスセクション SSDSE-B実データ）',
              fontsize=12, fontweight='bold')

# (a) β_気温 の時系列（エラーバー付き）
ax4a = axes4[0]
yrs_reg  = df_panel['年度'].values
betas    = df_panel['β_気温'].values
ses      = df_panel['se_気温'].values
pvals4   = df_panel['p_気温'].values

colors4 = ['#C62828' if p < 0.05 else '#9E9E9E' for p in pvals4]
ax4a.bar(yrs_reg, betas, color=colors4, alpha=0.7, width=0.6, zorder=2)
ax4a.errorbar(yrs_reg, betas, yerr=1.96 * ses,
              fmt='none', color='#333', capsize=4, linewidth=1.3, zorder=3)
ax4a.axhline(0, color='black', linewidth=1.0, linestyle='--', alpha=0.6)
ax4a.set_xlabel('年度', fontsize=11)
ax4a.set_ylabel('回帰係数 β（気温→降水量）', fontsize=11)
ax4a.set_title('β（気温）の年別推定値（±1.96SE）\n赤=p<0.05 有意', fontsize=11, fontweight='bold')
ax4a.set_xticks(yrs_reg)
ax4a.set_xticklabels([str(y) for y in yrs_reg], rotation=45, ha='right')
ax4a.grid(axis='y', alpha=0.3)

from matplotlib.patches import Patch
ax4a.legend(handles=[
    Patch(color='#C62828', alpha=0.7, label='p<0.05 有意'),
    Patch(color='#9E9E9E', alpha=0.7, label='n.s.')
], fontsize=9, loc='upper right')

# (b) R² の時系列
ax4b = axes4[1]
r2_vals = df_panel['R²'].values
ax4b.plot(yrs_reg, r2_vals, 'o-', color='#1565C0', linewidth=2, markersize=8, zorder=3)
ax4b.fill_between(yrs_reg, 0, r2_vals, color='#BBDEFB', alpha=0.4)
ax4b.set_xlabel('年度', fontsize=11)
ax4b.set_ylabel('決定係数 R²', fontsize=11)
ax4b.set_title('モデルの説明力（R²）の年別変動\n（降水量 ~ 気温 + 気温較差）',
               fontsize=11, fontweight='bold')
ax4b.set_xticks(yrs_reg)
ax4b.set_xticklabels([str(y) for y in yrs_reg], rotation=45, ha='right')
ax4b.set_ylim(0, 1)
ax4b.grid(axis='y', alpha=0.3)

# R² の平均値を注記
mean_r2 = r2_vals.mean()
ax4b.axhline(mean_r2, color='#C62828', linestyle='--', linewidth=1.5, alpha=0.7,
             label=f'平均 R²={mean_r2:.3f}')
ax4b.legend(fontsize=10)

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2023_H5_4_fig4_beta_ts.png'), bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2023_H5_4_fig4_beta_ts.png")

# ================================================================
# ■ 完了サマリ
# ================================================================
print("\n" + "=" * 60)
print("全図の生成完了（4枚）")
print("  fig1: 2023_H5_4_fig1_scatter2022.png   — 断面散布図（2022年）")
print("  fig2: 2023_H5_4_fig2_timeseries.png    — 時系列 2軸グラフ")
print("  fig3: 2023_H5_4_fig3_panel_scatter.png — パネル散布図（複数年）")
print("  fig4: 2023_H5_4_fig4_beta_ts.png       — 回帰係数の時系列")
print("=" * 60)
print("\n【代理変数について】")
print("  本コードでは「海水温」の代わりに「年平均気温（B4101）」を使用。")
print("  海水温と大気気温は密接に連動するため、代理変数として有効と考えられる。")
print("  海水温の実データ（例：気象庁 日本近海の海面水温データ）を")
print("  入手できれば、B4101 と置き換えることでより直接的な分析が可能。")