"""
2020年度 統計データ分析コンペティション
審査員奨励賞（高校生の部）
「過疎化・人口流出の要因と地域活性化策の分析」
使用データ: SSDSE-B-2026.csv（実公的データ）
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2020_H5_1_shorei.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ── データ読み込み ──────────────────────────────────────────────────────────
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

# ── 地域マップ ──────────────────────────────────────────────────────────────
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北', '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄'
}
region_colors = {
    '北海道・東北': '#4e9af1', '関東': '#e05c5c', '中部': '#f0a500',
    '近畿': '#5cb85c', '中国・四国': '#9b59b6', '九州・沖縄': '#f39c12'
}

# ── 変数の計算 ───────────────────────────────────────────────────────────────
df_b['地域区分'] = df_b['都道府県'].map(region_map)

df_b['純移動率'] = (
    (df_b['転入者数（日本人移動者）'] - df_b['転出者数（日本人移動者）'])
    / df_b['総人口'] * 100
)
df_b['転出率'] = df_b['転出者数（日本人移動者）'] / df_b['総人口'] * 100
df_b['転入率'] = df_b['転入者数（日本人移動者）'] / df_b['総人口'] * 100
df_b['高齢化率'] = df_b['65歳以上人口'] / df_b['総人口'] * 100
df_b['求人倍率'] = df_b['月間有効求人数（一般）'] / df_b['月間有効求職者数（一般）']
df_b['消費支出万円'] = df_b['消費支出（二人以上の世帯）'] / 10000
df_b['大学進学率'] = (
    df_b['高等学校卒業者のうち進学者数'] / df_b['高等学校卒業者数'] * 100
)

# ── 2022年断面データ ─────────────────────────────────────────────────────────
df22 = df_b[df_b['年度'] == 2022].copy()

# ── OLS重回帰（標準化係数） ──────────────────────────────────────────────────
exog_cols = ['求人倍率', '消費支出万円', '高齢化率', '大学進学率']
X_raw = df22[exog_cols].copy()
X_std = (X_raw - X_raw.mean()) / X_raw.std()
y_ols = df22['純移動率']
X_const = sm.add_constant(X_std)
ols_model = sm.OLS(y_ols, X_const).fit()
print(ols_model.summary())

coefs  = ols_model.params[1:]
pvals  = ols_model.pvalues[1:]
conf   = ols_model.conf_int().iloc[1:]
r2     = ols_model.rsquared
adjr2  = ols_model.rsquared_adj
fstat  = ols_model.fvalue
fpval  = ols_model.f_pvalue

print(f"\nR² = {r2:.3f}, Adj.R² = {adjr2:.3f}")
print(f"F統計量 = {fstat:.2f}, p = {fpval:.2e}")

# 相関係数
for col in exog_cols:
    r, p = stats.pearsonr(df22[col].dropna(), y_ols[df22[col].dropna().index])
    print(f"{col}: r={r:.3f}, p={p:.4f}")

# ── 地域別時系列集計（2012〜2023） ──────────────────────────────────────────
region_ts = (
    df_b.groupby(['年度', '地域区分'])['純移動率']
    .mean()
    .reset_index()
)
years = sorted(df_b['年度'].unique())

# ────────────────────────────────────────────────────────────────────────────
# 図1: 地域別純移動率の時系列（2012〜2023）
# ────────────────────────────────────────────────────────────────────────────
fig1, ax1 = plt.subplots(figsize=(10, 5.5))

regions_ordered = ['関東', '近畿', '中部', '北海道・東北', '中国・四国', '九州・沖縄']
for reg in regions_ordered:
    subset = region_ts[region_ts['地域区分'] == reg]
    ax1.plot(
        subset['年度'], subset['純移動率'],
        color=region_colors[reg], linewidth=2.2,
        marker='o', markersize=5, label=reg
    )

ax1.axhline(0, color='black', linewidth=0.8, linestyle='--', alpha=0.5)
ax1.set_title('地域別 純移動率の推移（2012〜2023年）', fontsize=14, fontweight='bold', pad=12)
ax1.set_xlabel('年度', fontsize=11)
ax1.set_ylabel('純移動率（%）', fontsize=11)
ax1.set_xticks(years)
ax1.set_xticklabels([str(y) for y in years], rotation=45)
ax1.legend(loc='upper left', fontsize=10, framealpha=0.85)
ax1.grid(axis='y', alpha=0.3)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
fig1.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2020_H5_1_fig1.png'), dpi=150, bbox_inches='tight')
plt.close(fig1)
print("図1 保存完了")

# ────────────────────────────────────────────────────────────────────────────
# 図2: 都道府県別純移動率ランキング（2022年）
# ────────────────────────────────────────────────────────────────────────────
df22_sorted = df22.sort_values('純移動率')
colors_bar = [region_colors[region_map[pref]] for pref in df22_sorted['都道府県']]

fig2, ax2 = plt.subplots(figsize=(10, 11))
bars = ax2.barh(
    df22_sorted['都道府県'], df22_sorted['純移動率'],
    color=colors_bar, edgecolor='white', linewidth=0.5
)
ax2.axvline(0, color='black', linewidth=1.0)
ax2.set_title('都道府県別 純移動率ランキング（2022年）', fontsize=13, fontweight='bold', pad=10)
ax2.set_xlabel('純移動率（%）', fontsize=11)
ax2.tick_params(axis='y', labelsize=9)
ax2.grid(axis='x', alpha=0.3)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)

# 凡例
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=region_colors[r], label=r) for r in region_colors]
ax2.legend(handles=legend_elements, loc='lower right', fontsize=9, framealpha=0.85)
fig2.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2020_H5_1_fig2.png'), dpi=150, bbox_inches='tight')
plt.close(fig2)
print("図2 保存完了")

# ────────────────────────────────────────────────────────────────────────────
# 図3: 求人倍率 vs 純移動率（散布図・回帰線・47都道府県ラベル）
# ────────────────────────────────────────────────────────────────────────────
fig3, ax3 = plt.subplots(figsize=(10, 7))

for _, row in df22.iterrows():
    c = region_colors[region_map.get(row['都道府県'], '関東')]
    ax3.scatter(row['求人倍率'], row['純移動率'], color=c, s=60, zorder=3, alpha=0.85)
    ax3.annotate(
        row['都道府県'].replace('県', '').replace('府', '').replace('都', '').replace('道', ''),
        (row['求人倍率'], row['純移動率']),
        fontsize=7.5, ha='center', va='bottom',
        xytext=(0, 4), textcoords='offset points', color='#333333'
    )

# 回帰線
x_plot = np.linspace(df22['求人倍率'].min() * 0.97, df22['求人倍率'].max() * 1.03, 200)
slope, intercept, r_val, p_val, _ = stats.linregress(df22['求人倍率'], df22['純移動率'])
ax3.plot(x_plot, intercept + slope * x_plot, color='#c0392b', linewidth=1.8,
         linestyle='--', zorder=2, label=f'回帰線 r={r_val:.3f}（p={p_val:.3f}）')

ax3.axhline(0, color='gray', linewidth=0.7, linestyle=':')
ax3.set_title('求人倍率と純移動率の関係（2022年・47都道府県）', fontsize=13, fontweight='bold', pad=12)
ax3.set_xlabel('求人倍率（月間有効求人数÷月間有効求職者数）', fontsize=11)
ax3.set_ylabel('純移動率（%）', fontsize=11)
ax3.legend(fontsize=10, loc='upper right')
ax3.grid(alpha=0.25)
ax3.spines['top'].set_visible(False)
ax3.spines['right'].set_visible(False)

# 地域凡例
from matplotlib.patches import Patch
leg_els = [Patch(facecolor=region_colors[r], label=r) for r in region_colors]
ax3.legend(handles=leg_els + [
    plt.Line2D([0], [0], color='#c0392b', linewidth=1.8, linestyle='--',
               label=f'回帰線 r={r_val:.3f}（p={p_val:.3f}）')
], fontsize=9, framealpha=0.85, loc='upper right')
fig3.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2020_H5_1_fig3.png'), dpi=150, bbox_inches='tight')
plt.close(fig3)
print("図3 保存完了")

# ────────────────────────────────────────────────────────────────────────────
# 図4: 標準化偏回帰係数プロット（OLS結果）
# ────────────────────────────────────────────────────────────────────────────
coef_labels = ['求人倍率', '消費支出\n（万円）', '高齢化率', '大学進学率']
coef_vals   = coefs.values
ci_low = conf.iloc[:, 0].values
ci_hi  = conf.iloc[:, 1].values
err_lo = coef_vals - ci_low
err_hi = ci_hi - coef_vals

bar_colors = ['#c0392b' if p < 0.05 else '#95a5a6' for p in pvals.values]

fig4, ax4 = plt.subplots(figsize=(7, 4.5))
y_pos = np.arange(len(coef_labels))
ax4.barh(y_pos, coef_vals, xerr=[err_lo, err_hi], color=bar_colors,
         edgecolor='white', linewidth=0.8, capsize=5, error_kw={'linewidth': 1.5})
ax4.axvline(0, color='black', linewidth=0.9)
ax4.set_yticks(y_pos)
ax4.set_yticklabels(coef_labels, fontsize=11)
ax4.set_xlabel('標準化偏回帰係数（95%CI）', fontsize=11)
ax4.set_title(f'純移動率の重回帰分析（標準化係数）\nR²={r2:.3f}, Adj.R²={adjr2:.3f}, p={fpval:.2e}',
              fontsize=12, fontweight='bold', pad=10)
ax4.grid(axis='x', alpha=0.3)
ax4.spines['top'].set_visible(False)
ax4.spines['right'].set_visible(False)

# 凡例（赤=有意、灰=非有意）
from matplotlib.patches import Patch
leg_sig = [Patch(facecolor='#c0392b', label='p < 0.05（有意）'),
           Patch(facecolor='#95a5a6', label='p >= 0.05（非有意）')]
ax4.legend(handles=leg_sig, fontsize=9, loc='lower right', framealpha=0.85)
fig4.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2020_H5_1_fig4.png'), dpi=150, bbox_inches='tight')
plt.close(fig4)
print("図4 保存完了")

# ── 統計値サマリー出力 ─────────────────────────────────────────────────────
print("\n===== 統計値サマリー =====")
print(f"分析対象: 47都道府県 × 2012〜2023年（12年間）")
print(f"2022年 純移動率 平均={df22['純移動率'].mean():.3f}%, SD={df22['純移動率'].std():.3f}%")
print(f"2022年 純移動率 最小={df22['純移動率'].min():.3f}% ({df22.loc[df22['純移動率'].idxmin(),'都道府県']})")
print(f"2022年 純移動率 最大={df22['純移動率'].max():.3f}% ({df22.loc[df22['純移動率'].idxmax(),'都道府県']})")
print(f"R²={r2:.3f}, Adj.R²={adjr2:.3f}, F={fstat:.2f}, p={fpval:.2e}")
for i, col in enumerate(exog_cols):
    sig = '*' if pvals.iloc[i] < 0.05 else ' '
    print(f"  {col}: β={coef_vals[i]:.4f}, p={pvals.iloc[i]:.4f} {sig}")
print("図4枚すべて保存完了")