"""
2019_U3_suri.py
2019年度 統計数理賞（大学生・一般の部）
「地域イノベーション・知識産業と経済成長の関係：主成分分析と回帰」
手法: 主成分分析（PCA）、重回帰分析、Ward法クラスタリング
データ: SSDSE-B-2026.csv（47都道府県、2022年断面）
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2019_U3_suri.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ─────────────────────────────────────────
# 0. データ読み込み
# ─────────────────────────────────────────
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

df2022 = df_b[df_b['年度'] == 2022].copy().reset_index(drop=True)
print(f"2022年断面データ: {len(df2022)} 都道府県")

# ─────────────────────────────────────────
# 1. 変数構築
# ─────────────────────────────────────────
# イノベーション・知識産業の代理変数（5変数）
df2022['大学進学率'] = (
    df2022['高等学校卒業者のうち進学者数'] / df2022['高等学校卒業者数']
)
df2022['大学学生割合'] = (
    df2022['大学学生数'] / df2022['総人口']
)  # 第三次産業（知識サービス）の代理
df2022['教育費'] = df2022['教育費（二人以上の世帯）']
df2022['転入率'] = (
    df2022['転入者数（日本人移動者）'] / df2022['総人口']
)
df2022['高校等数_万人'] = (
    df2022['高等学校数'] / df2022['総人口'] * 10000
)

# 経済指標（2変数）
df2022['消費支出'] = df2022['消費支出（二人以上の世帯）']
df2022['求人倍率'] = (
    df2022['月間有効求人数（一般）'] / df2022['月間有効求職者数（一般）']
)

# 地域区分
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北', '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄'
}
region_colors = {
    '北海道・東北': '#4e9af1', '関東': '#e05c5c', '中部': '#f0a500',
    '近畿': '#5cb85c', '中国・四国': '#9b59b6', '九州・沖縄': '#f39c12'
}

df2022['地域区分'] = df2022['都道府県'].map(region_map)

# ─────────────────────────────────────────
# 2. 主成分分析（PCA）
# ─────────────────────────────────────────
feature_cols = ['大学進学率', '大学学生割合', '教育費', '転入率', '高校等数_万人']
var_labels   = ['大学進学率', '大学学生割合\n（知識集積）', '教育費', '転入率', '高校等数\n/万人']

X = df2022[feature_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=3)
scores = pca.fit_transform(X_scaled)  # shape (47, 3)
loadings = pca.components_           # shape (3, 5)
explained_var = pca.explained_variance_ratio_

print("\n=== PCA 説明分散比 ===")
for i, ev in enumerate(explained_var):
    print(f"  PC{i+1}: {ev:.4f} ({ev*100:.1f}%)")
print(f"  PC1+PC2: {(explained_var[0]+explained_var[1])*100:.1f}%")

# 知識経済指数 = PC1スコア（符号調整：大学進学率がPC1正になるよう）
# PC1が大学進学率・大学学生割合で正→そのまま
if loadings[0, 0] < 0:   # 大学進学率の負荷量が負なら符号反転
    scores[:, 0] = -scores[:, 0]
    loadings[0, :] = -loadings[0, :]

df2022['PC1_知識経済指数'] = scores[:, 0]
df2022['PC2'] = scores[:, 1]

# ─────────────────────────────────────────
# 3. Ward法クラスタリング (k=4)
# ─────────────────────────────────────────
Z = linkage(X_scaled, method='ward')
labels_k4 = fcluster(Z, t=4, criterion='maxclust')
df2022['cluster'] = labels_k4

print("\n=== Ward法クラスタ（k=4）===")
for c in sorted(df2022['cluster'].unique()):
    prefs = df2022[df2022['cluster'] == c]['都道府県'].tolist()
    print(f"  クラスタ{c}: {prefs}")

# ─────────────────────────────────────────
# 4. 重回帰分析 (PC1, PC2 → 消費支出)
# ─────────────────────────────────────────
y_cons = df2022['消費支出'].values
X_reg  = sm.add_constant(np.column_stack([scores[:, 0], scores[:, 1]]))
model  = sm.OLS(y_cons, X_reg).fit()

print("\n=== 重回帰分析結果（PC1, PC2 → 消費支出）===")
print(model.summary())

# 標準化偏回帰係数（beta）
y_std = (y_cons - y_cons.mean()) / y_cons.std()
X_std = np.column_stack([
    (scores[:, 0] - scores[:, 0].mean()) / scores[:, 0].std(),
    (scores[:, 1] - scores[:, 1].mean()) / scores[:, 1].std()
])
X_std_wc = sm.add_constant(X_std)
model_std = sm.OLS(y_std, X_std_wc).fit()
beta_coefs = model_std.params[1:]   # PC1, PC2 の標準化偏回帰係数
beta_pvals = model_std.pvalues[1:]

print("\n=== 標準化偏回帰係数 ===")
print(f"  beta_PC1 = {beta_coefs[0]:.4f} (p={beta_pvals[0]:.4f})")
print(f"  beta_PC2 = {beta_coefs[1]:.4f} (p={beta_pvals[1]:.4f})")
print(f"  R2 = {model.rsquared:.4f}, R2adj = {model.rsquared_adj:.4f}")
print(f"  F = {model.fvalue:.4f}, p(F) = {model.f_pvalue:.4f}")

# 求人倍率との回帰も補足確認
X_reg2 = sm.add_constant(np.column_stack([scores[:, 0], scores[:, 1]]))
y_kyu   = df2022['求人倍率'].values
model2  = sm.OLS(y_kyu, X_reg2).fit()
print(f"\n=== PC → 求人倍率 R2 = {model2.rsquared:.4f} ===")

# ─────────────────────────────────────────
# 5. 図1: PCAバイプロット
# ─────────────────────────────────────────
fig, ax = plt.subplots(figsize=(9, 7))

for region, color in region_colors.items():
    mask = df2022['地域区分'] == region
    ax.scatter(scores[mask, 0], scores[mask, 1], c=color,
               label=region, s=60, zorder=3, alpha=0.85)

# 都道府県ラベル（全47）
for i, row in df2022.iterrows():
    ax.annotate(row['都道府県'], (scores[i, 0], scores[i, 1]),
                fontsize=7, ha='center', va='bottom', xytext=(0, 4),
                textcoords='offset points', color='#333333')

# 変数ベクトル（バイプロット）
scale = 2.5
for j, vl in enumerate(var_labels):
    ax.annotate('', xy=(loadings[0, j]*scale, loadings[1, j]*scale),
                xytext=(0, 0),
                arrowprops=dict(arrowstyle='->', color='#1565C0', lw=1.5))
    ax.text(loadings[0, j]*scale*1.1, loadings[1, j]*scale*1.1,
            vl, fontsize=8, color='#1565C0', ha='center', va='center')

ax.axhline(0, color='gray', lw=0.6, ls='--', alpha=0.5)
ax.axvline(0, color='gray', lw=0.6, ls='--', alpha=0.5)
ax.set_xlabel(f'PC1（説明分散比 {explained_var[0]*100:.1f}%）', fontsize=12)
ax.set_ylabel(f'PC2（説明分散比 {explained_var[1]*100:.1f}%）', fontsize=12)
ax.set_title('図1：PCAバイプロット（知識経済変数、2022年、47都道府県）', fontsize=13, fontweight='bold')
ax.legend(title='地域区分', loc='lower right', fontsize=9, title_fontsize=9)
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_U3_fig1.png'), dpi=150, bbox_inches='tight')
plt.close(fig)
print("\n[保存] 図1: PCAバイプロット")

# ─────────────────────────────────────────
# 6. 図2: 主成分負荷量ヒートマップ (PC1〜PC3)
# ─────────────────────────────────────────
fig, ax = plt.subplots(figsize=(8, 4))

load_matrix = loadings[:3, :]  # (3, 5)
short_labels = ['大学\n進学率', '大学学生\n割合', '教育費', '転入率', '高校等数\n/万人']
im = ax.imshow(load_matrix, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax, label='負荷量')

ax.set_xticks(range(len(short_labels)))
ax.set_xticklabels(short_labels, fontsize=10)
ax.set_yticks(range(3))
ax.set_yticklabels([f'PC{i+1}（{explained_var[i]*100:.1f}%）' for i in range(3)], fontsize=10)

for r in range(3):
    for c in range(len(short_labels)):
        val = load_matrix[r, c]
        ax.text(c, r, f'{val:.3f}', ha='center', va='center',
                fontsize=11, fontweight='bold',
                color='white' if abs(val) > 0.5 else '#222')

ax.set_title('図2：主成分負荷量ヒートマップ（PC1〜PC3）', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_U3_fig2.png'), dpi=150, bbox_inches='tight')
plt.close(fig)
print("[保存] 図2: 負荷量ヒートマップ")

# ─────────────────────────────────────────
# 7. 図3: Ward法デンドログラム
# ─────────────────────────────────────────
fig, ax = plt.subplots(figsize=(14, 6))

# クラスタ4のカット高さを取得
sorted_z = np.sort(Z[:, 2])
cut_height = (sorted_z[-3] + sorted_z[-4]) / 2  # 4クラスタ境界

dendrogram(Z, labels=list(df2022['都道府県']), ax=ax,
           leaf_rotation=90, leaf_font_size=9,
           color_threshold=cut_height)
ax.axhline(cut_height, color='red', ls='--', lw=1.5, label=f'k=4 カット（高さ≈{cut_height:.2f}）')
ax.set_title('図3：Ward法デンドログラム（47都道府県、k=4）', fontsize=12, fontweight='bold')
ax.set_xlabel('都道府県', fontsize=10)
ax.set_ylabel('距離（Ward法）', fontsize=10)
ax.legend(fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_U3_fig3.png'), dpi=150, bbox_inches='tight')
plt.close(fig)
print("[保存] 図3: Wardデンドログラム")

# ─────────────────────────────────────────
# 8. 図4: 標準化偏回帰係数プロット
# ─────────────────────────────────────────
fig, ax = plt.subplots(figsize=(7, 4))

pred_names = ['PC1\n（知識経済指数）', 'PC2\n（教育費・転入）']
colors_bar = ['#1565C0' if p < 0.05 else '#aaaaaa' for p in beta_pvals]
bars = ax.barh(pred_names, beta_coefs, color=colors_bar, edgecolor='white', height=0.5)

# エラーバー（95%CI）
ci = model_std.conf_int()
if hasattr(ci, 'iloc'):
    ci_low  = ci.iloc[1:, 0].values
    ci_high = ci.iloc[1:, 1].values
else:
    ci_low  = ci[1:, 0]
    ci_high = ci[1:, 1]
for i, (lo, hi, val) in enumerate(zip(ci_low, ci_high, beta_coefs)):
    ax.plot([lo, hi], [i, i], color='#333', lw=2, zorder=5)
    ax.text(val + 0.02 if val >= 0 else val - 0.02,
            i, f'{val:.3f}\n(p={beta_pvals[i]:.3f})',
            va='center', ha='left' if val >= 0 else 'right',
            fontsize=9, color='#333')

ax.axvline(0, color='black', lw=1)
ax.set_xlabel('標準化偏回帰係数 (β)', fontsize=11)
ax.set_title('図4：標準化偏回帰係数（PC1, PC2 → 消費支出）', fontsize=12, fontweight='bold')
ax.set_xlim(-1.2, 1.2)

# 凡例
from matplotlib.patches import Patch
legend_els = [Patch(color='#1565C0', label='p<0.05（有意）'),
              Patch(color='#aaaaaa', label='p≥0.05（非有意）')]
ax.legend(handles=legend_els, fontsize=9, loc='lower right')
ax.grid(axis='x', alpha=0.3)

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, '2019_U3_fig4.png'), dpi=150, bbox_inches='tight')
plt.close(fig)
print("[保存] 図4: 標準化偏回帰係数プロット")

# ─────────────────────────────────────────
# 9. 結果サマリ出力（HTML生成用）
# ─────────────────────────────────────────
print("\n" + "="*50)
print("=== HTML生成用 統計値サマリ ===")
print("="*50)
print(f"分析対象: 47都道府県（2022年断面）")
print(f"PC1 説明分散比: {explained_var[0]*100:.1f}%")
print(f"PC2 説明分散比: {explained_var[1]*100:.1f}%")
print(f"PC3 説明分散比: {explained_var[2]*100:.1f}%")
print(f"PC1+PC2 累積: {(explained_var[0]+explained_var[1])*100:.1f}%")
print(f"PC1+PC2+PC3 累積: {sum(explained_var[:3])*100:.1f}%")
print(f"重回帰 R2: {model.rsquared:.4f}")
print(f"重回帰 R2adj: {model.rsquared_adj:.4f}")
print(f"重回帰 F統計量: {model.fvalue:.2f}")
print(f"重回帰 p(F): {model.f_pvalue:.4f}")
print(f"beta_PC1: {beta_coefs[0]:.4f} (p={beta_pvals[0]:.4f})")
print(f"beta_PC2: {beta_coefs[1]:.4f} (p={beta_pvals[1]:.4f})")

# 負荷量
print("\n負荷量行列 PC1:")
for j, vl in enumerate(feature_cols):
    print(f"  {vl}: {loadings[0,j]:.4f}")
print("負荷量行列 PC2:")
for j, vl in enumerate(feature_cols):
    print(f"  {vl}: {loadings[1,j]:.4f}")

# クラスタ別都道府県
print("\nクラスタ別都道府県:")
for c in sorted(df2022['cluster'].unique()):
    prefs = df2022[df2022['cluster'] == c]['都道府県'].tolist()
    pc1_m = df2022[df2022['cluster'] == c]['PC1_知識経済指数'].mean()
    print(f"  クラスタ{c}（PC1平均={pc1_m:.2f}）: {', '.join(prefs)}")

print("\n図ファイル確認:")
for i in range(1, 5):
    fp = os.path.join(FIG_DIR, f'2019_U3_fig{i}.png')
    print(f"  fig{i}: {'OK' if os.path.exists(fp) else 'NG'} - {fp}")