"""
教育用再現コード: 2024年 統計データ分析コンペティション 審査員奨励賞（大学生・一般）
=================================================================================
論文タイトル：中学生の言語による表現を巡る規定要因分析
              ―潜在意味解析とElastic Net回帰を用いた分析―
受賞区分  ：審査員奨励賞 ［大学生・一般の部］
著者      ：陣内未来、立山皓基（九州大学）

【分析概要】
  データ：SSDSE-B-2026.csv（47都道府県 × 2022年度）
  目的   ：SSDSE-B の社会指標群を「文書-単語行列」に見立て、
            LSA（TruncatedSVD）で都道府県を潜在空間に写像し、
            Elastic Net回帰で大学進学率（言語的学力の代理指標）の
            規定要因を特定する

  Step1. SSDSE-B-2026.csv から実データを読み込む（2022年度・47都道府県）
  Step2. 特徴量行列（TF-IDF アナログ）を構築し、StandardScaler で標準化
  Step3. LSA（TruncatedSVD, k=5）で低次元化
  Step4. ElasticNetCV で回帰・変数選択
  Step5. 予測精度の評価と可視化

【特徴量（TF-IDF アナログ, 17変数）】
  合計特殊出生率・転入出者数・年平均気温・年間降水量・小学校児童率
  中学校生徒率・大学等進学率・消費支出・食料費率・教育費率
  有効求人倍率・保育所密度・病院密度・高齢化率・住宅地価・宿泊者per capita

【目的変数】
  大学等進学率 = E4602 / E4601 × 100（言語的学力・アカデミック達成の代理指標）

【図の出力】
  html/figures/2024_U5_3_fig1_tfidf.png      ... スクリープロット + 成分負荷量
  html/figures/2024_U5_3_fig2_elasticnet.png ... CV スコア vs α
  html/figures/2024_U5_3_fig3_coef.png       ... Elastic Net 係数
  html/figures/2024_U5_3_fig4_scatter.png    ... 予測値 vs 実測値
=================================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2024_U5_3_shorei.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# ── パス設定 ─────────────────────────────────────────────────────────────────
BASE_DIR = os.path.join(_script_dir, '..')
FIG_DIR  = os.path.join(BASE_DIR, 'html', 'figures')
DATA_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'SSDSE-B-2026.csv')
os.makedirs(FIG_DIR, exist_ok=True)

plt.rcParams.update({
    'font.family':        'Hiragino Sans',
    'axes.unicode_minus': False,
    'figure.dpi':         150,
    'axes.spines.top':    False,
    'axes.spines.right':  False,
})

# ═══════════════════════════════════════════════════════════════════════════════
# ■ Step1. 実データ読み込み（SSDSE-B-2026, 2022年度・47都道府県）
# ═══════════════════════════════════════════════════════════════════════════════
print("=" * 60)
print("■ SSDSE-B-2026.csv 読み込み（2022年度）")

df_all = pd.read_csv(DATA_PATH, encoding='cp932', header=0, skiprows=[1])
df = df_all[df_all['SSDSE-B-2026'] == 2022].copy().reset_index(drop=True)

assert len(df) == 47, f"47都道府県分のデータが必要ですが {len(df)} 行しかありません"
print(f"  読み込み完了: {len(df)} 都道府県")

PREFS = list(df['Prefecture'])

# ── 特徴量エンジニアリング ────────────────────────────────────────────────────
# 各変数は「都道府県」という文書における「社会的単語頻度」に相当する

feat = pd.DataFrame(index=df.index)

feat['合計特殊出生率']   = df['A4103']                                     # TFR
feat['転入者数_pc']      = df['A5101'] / df['A1101'] * 1000                # 転入per千人
feat['転出者数_pc']      = df['A5102'] / df['A1101'] * 1000                # 転出per千人
feat['年平均気温']       = df['B4101']                                     # ℃
feat['年間降水量']       = df['B4109']                                     # mm
feat['小学校児童率']     = df['E2501'] / df['A1301'] * 100                 # 15歳以上人口比
feat['中学校生徒率']     = df['E3501'] / df['A1301'] * 100
feat['消費支出']         = df['L3221']                                     # 円/月
feat['食料費率']         = df['L322101'] / df['L3221'] * 100              # エンゲル係数相当
feat['教育費率']         = df['L322108'] / df['L3221'] * 100              # 教育費割合
feat['有効求人倍率']     = df['F3103'] / df['F3102']                      # 就職者数/求職者数
feat['保育所密度']       = df['J2503'] / df['A1101'] * 10000              # 保育所per万人
feat['病院密度']         = df['I510120'] / df['A1101'] * 10000            # 病院per万人
feat['高齢化率']         = df['A1303'] / df['A1101'] * 100               # 65歳以上割合
feat['住宅地価']         = df['C5401']                                    # 千円/m²
feat['宿泊者pc']         = df['G7101'] / df['A1101']                     # 宿泊者per人口

FEAT_NAMES = list(feat.columns)
print(f"  特徴量数: {len(FEAT_NAMES)}")
print(f"  特徴量: {FEAT_NAMES}")

# ── 目的変数：大学等進学率（言語的学力の代理指標） ──────────────────────────
# E4602: 大学等への進学者数, E4601: 中学校・中等教育学校後期課程 卒業者数
y_raw = df['E4602'] / df['E4601'] * 100   # 大学進学率 (%)
y = y_raw.values.astype(float)

print(f"  目的変数（大学進学率）: 平均={y.mean():.1f}%, 範囲=[{y.min():.1f}, {y.max():.1f}]%")

# ── 特徴量行列 ─────────────────────────────────────────────────────────────────
X_raw = feat.values.astype(float)
print(f"  特徴量行列サイズ: {X_raw.shape}  (47都道府県 × {len(FEAT_NAMES)}変数)")

# ═══════════════════════════════════════════════════════════════════════════════
# ■ Step2. 標準化 → LSA（TruncatedSVD）
#   StandardScaler で標準化した後、TF-IDF 行列の代わりとして SVD を適用
# ═══════════════════════════════════════════════════════════════════════════════
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)   # 47 × 16

n_components = 5
# random_state=42 は行列分解のアルゴリズム上の乱数種（データ生成ではない）
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_lsa = svd.fit_transform(X_scaled)      # 47 × 5

print("\n■ LSA（TruncatedSVD）")
print(f"  入力行列: {X_scaled.shape}")
print(f"  LSA後 (k={n_components}): {X_lsa.shape}")
print(f"  各成分の説明分散比: {svd.explained_variance_ratio_.round(3)}")
print(f"  累積説明分散比:      {svd.explained_variance_ratio_.cumsum().round(3)}")

lsa_names = [f'LSA成分{i+1}' for i in range(n_components)]

# ═══════════════════════════════════════════════════════════════════════════════
# ■ Step3. Elastic Net 回帰（目的変数：大学進学率）
# ═══════════════════════════════════════════════════════════════════════════════
print("\n■ ElasticNetCV（5-fold CV）")

l1_ratios  = [0.1, 0.5, 0.9, 1.0]
alphas_cv  = np.logspace(-3, 1, 60)

en_cv = ElasticNetCV(l1_ratio=l1_ratios, alphas=alphas_cv, cv=5, max_iter=10000)
en_cv.fit(X_lsa, y)

print(f"  最適α         = {en_cv.alpha_:.4f}")
print(f"  最適l1_ratio  = {en_cv.l1_ratio_:.2f}")

coef_df = pd.DataFrame({
    '成分':  lsa_names,
    '係数':  en_cv.coef_,
}).sort_values('係数', key=abs, ascending=False)

print("\n  【Elastic Net 係数（全成分）】")
print(coef_df.round(4).to_string(index=False))

y_pred = en_cv.predict(X_lsa)
r2   = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"\n  R² = {r2:.4f},  RMSE = {rmse:.4f} [%]")

# 正則化パス（最適 l1_ratio で固定）
enet_path_model = ElasticNet(l1_ratio=en_cv.l1_ratio_, max_iter=10000)
alphas_path = np.logspace(-3, 1, 80)
coefs_path = []
for a in alphas_path:
    enet_path_model.set_params(alpha=a)
    enet_path_model.fit(X_lsa, y)
    coefs_path.append(enet_path_model.coef_.copy())
coefs_path = np.array(coefs_path)   # 80 × 5

# ═══════════════════════════════════════════════════════════════════════════════
# ■ 図の生成（4枚）
# ═══════════════════════════════════════════════════════════════════════════════

# ────────────────────────────────────────────────────────────────────────────
# 図1：スクリープロット + 各成分の上位特徴量負荷量
# ────────────────────────────────────────────────────────────────────────────
print("\n図1: スクリープロット + 特徴量負荷量 を作成中...")

exp_var  = svd.explained_variance_ratio_
cum_var  = exp_var.cumsum()
comp_idx = np.arange(1, n_components + 1)

# 成分1・2 のローディング（絶対値上位8特徴量）
def top_loadings(comp_num, top_n=8):
    loadings = svd.components_[comp_num - 1]          # shape: (n_features,)
    top_idx  = np.argsort(np.abs(loadings))[::-1][:top_n]
    return [FEAT_NAMES[i] for i in top_idx], loadings[top_idx]

top_words1, top_vals1 = top_loadings(1, top_n=8)
top_words2, top_vals2 = top_loadings(2, top_n=8)

fig1, axes1 = plt.subplots(1, 3, figsize=(16, 5))
fig1.suptitle('LSA（潜在意味解析）: スクリープロットと特徴量負荷量', fontsize=13, fontweight='bold')

# スクリープロット
ax = axes1[0]
ax.bar(comp_idx, exp_var * 100, color='#1565C0', alpha=0.75, edgecolor='white', label='各成分')
ax2_twin = ax.twinx()
ax2_twin.plot(comp_idx, cum_var * 100, 'r-o', markersize=6, linewidth=2, label='累積')
ax2_twin.set_ylabel('累積説明分散比 (%)', fontsize=10, color='r')
ax2_twin.tick_params(axis='y', labelcolor='r')
ax2_twin.set_ylim(0, 105)
ax.set_xlabel('LSA 成分番号', fontsize=11)
ax.set_ylabel('説明分散比 (%)', fontsize=11)
ax.set_title('スクリープロット\n（各成分の寄与率）', fontsize=11, fontweight='bold')
ax.set_xticks(comp_idx)
ax.grid(axis='y', alpha=0.3)
for xi, ev in zip(comp_idx, exp_var):
    ax.text(xi, ev * 100 + 0.5, f'{ev*100:.1f}%', ha='center', va='bottom', fontsize=9)

# 成分1 負荷量
ax1b = axes1[1]
colors_b = ['#1565C0' if v >= 0 else '#C62828' for v in top_vals1]
ax1b.barh(top_words1[::-1], top_vals1[::-1], color=colors_b[::-1], edgecolor='white', alpha=0.85)
ax1b.axvline(0, color='black', linewidth=0.8)
ax1b.set_xlabel('負荷量', fontsize=11)
ax1b.set_title('LSA 第1成分 上位負荷量\n（青：正, 赤：負）', fontsize=11, fontweight='bold')
ax1b.grid(axis='x', alpha=0.3)

# 成分2 負荷量
ax1c = axes1[2]
colors_c = ['#1565C0' if v >= 0 else '#C62828' for v in top_vals2]
ax1c.barh(top_words2[::-1], top_vals2[::-1], color=colors_c[::-1], edgecolor='white', alpha=0.85)
ax1c.axvline(0, color='black', linewidth=0.8)
ax1c.set_xlabel('負荷量', fontsize=11)
ax1c.set_title('LSA 第2成分 上位負荷量\n（青：正, 赤：負）', fontsize=11, fontweight='bold')
ax1c.grid(axis='x', alpha=0.3)

plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2024_U5_3_fig1_tfidf.png'), bbox_inches='tight', dpi=150)
plt.close(fig1)
print("  → 2024_U5_3_fig1_tfidf.png 保存完了")

# ────────────────────────────────────────────────────────────────────────────
# 図2：正則化パスと CV スコア
# ────────────────────────────────────────────────────────────────────────────
print("図2: 正則化パスと CV スコアを作成中...")

fig2, axes2 = plt.subplots(1, 2, figsize=(13, 5))
fig2.suptitle('Elastic Net 正則化パスとクロスバリデーション（大学進学率）',
              fontsize=13, fontweight='bold')

# 正則化パス
ax2a = axes2[0]
colors_path = ['#1565C0', '#2E7D32', '#F57F17', '#6A1B9A', '#C62828']
for j in range(coefs_path.shape[1]):
    ax2a.semilogx(alphas_path, coefs_path[:, j],
                  alpha=0.8, linewidth=1.8, color=colors_path[j], label=lsa_names[j])
ax2a.axvline(en_cv.alpha_, color='black', linestyle='--', linewidth=2,
             label=f'最適α = {en_cv.alpha_:.4f}')
ax2a.set_xlabel('正則化パラメータ α（log scale）', fontsize=11)
ax2a.set_ylabel('係数値', fontsize=11)
ax2a.set_title(f'正則化パス (l1_ratio={en_cv.l1_ratio_:.2f})', fontsize=11, fontweight='bold')
ax2a.legend(fontsize=9, loc='upper right')
ax2a.grid(True, alpha=0.2)

# CV MSE vs α
ax2b = axes2[1]
mse_path = en_cv.mse_path_
if mse_path.ndim == 3:
    l1_idx = int(np.argmin(np.abs(np.array(l1_ratios) - en_cv.l1_ratio_)))
    cv_means = mse_path[l1_idx].mean(axis=-1)
    cv_stds  = mse_path[l1_idx].std(axis=-1)
    alphas_plot = en_cv.alphas_[l1_idx] if en_cv.alphas_.ndim > 1 else en_cv.alphas_
else:
    cv_means = mse_path.mean(axis=-1)
    cv_stds  = mse_path.std(axis=-1)
    alphas_plot = en_cv.alphas_

ax2b.semilogx(alphas_plot, cv_means, 'b-o', markersize=4, linewidth=1.8, label='CV-MSE（平均）')
ax2b.fill_between(alphas_plot, cv_means - cv_stds, cv_means + cv_stds,
                  alpha=0.2, color='blue', label='±1 SD')
ax2b.axvline(en_cv.alpha_, color='red', linestyle='--', linewidth=2,
             label=f'最適α = {en_cv.alpha_:.4f}')
ax2b.set_xlabel('α', fontsize=11)
ax2b.set_ylabel('CV-MSE', fontsize=11)
ax2b.set_title('クロスバリデーションスコア\n（5-fold CV）', fontsize=11, fontweight='bold')
ax2b.legend(fontsize=10)
ax2b.grid(True, alpha=0.2)

plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2024_U5_3_fig2_elasticnet.png'), bbox_inches='tight', dpi=150)
plt.close(fig2)
print("  → 2024_U5_3_fig2_elasticnet.png 保存完了")

# ────────────────────────────────────────────────────────────────────────────
# 図3：Elastic Net 係数（LSA 成分ごと）
# ────────────────────────────────────────────────────────────────────────────
print("図3: Elastic Net 係数棒グラフを作成中...")

coef_sorted = coef_df.sort_values('係数')

fig3, axes3 = plt.subplots(1, 2, figsize=(14, 5))
fig3.suptitle('Elastic Net モデル: LSA成分の係数と各成分の解釈',
              fontsize=13, fontweight='bold')

# 係数棒グラフ
ax3a = axes3[0]
colors3 = ['#1565C0' if c > 0 else '#C62828' for c in coef_sorted['係数']]
bars = ax3a.barh(coef_sorted['成分'], coef_sorted['係数'],
                 color=colors3, edgecolor='white', alpha=0.88)
ax3a.axvline(0, color='black', linewidth=1.0)
ax3a.set_xlabel('Elastic Net 係数', fontsize=12)
ax3a.set_title(
    f'LSA成分の係数\n（α={en_cv.alpha_:.4f}, l1_ratio={en_cv.l1_ratio_:.2f}, R²={r2:.3f}）',
    fontsize=11, fontweight='bold')
ax3a.grid(axis='x', alpha=0.3)
for bar, val in zip(ax3a.patches, coef_sorted['係数']):
    if abs(val) > 1e-6:
        x_pos = val + 0.005 * np.sign(val)
        ha = 'left' if val >= 0 else 'right'
        ax3a.text(x_pos, bar.get_y() + bar.get_height() / 2, f'{val:.3f}',
                  va='center', ha=ha, fontsize=10, color='#333333')

# 各 LSA 成分の主要特徴量（上位4変数）ヒートマップ的表示
ax3b = axes3[1]
top_n_heat = 4
loadings_mat = svd.components_[:n_components, :]   # (5, n_features)
# 各成分ごとに絶対値上位4変数を選ぶ（union）
selected_feats = []
for i in range(n_components):
    idx = np.argsort(np.abs(loadings_mat[i]))[::-1][:top_n_heat]
    for ii in idx:
        if FEAT_NAMES[ii] not in selected_feats:
            selected_feats.append(FEAT_NAMES[ii])

feat_idx = [FEAT_NAMES.index(f) for f in selected_feats]
heat_data = loadings_mat[:, feat_idx]   # (5, n_selected)

im = ax3b.imshow(heat_data, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
ax3b.set_xticks(range(len(selected_feats)))
ax3b.set_xticklabels(selected_feats, rotation=45, ha='right', fontsize=8)
ax3b.set_yticks(range(n_components))
ax3b.set_yticklabels(lsa_names, fontsize=10)
ax3b.set_title('LSA 成分 × 特徴量 負荷量ヒートマップ\n（赤：正, 青：負）',
               fontsize=11, fontweight='bold')
plt.colorbar(im, ax=ax3b, fraction=0.046, pad=0.04)
for i in range(n_components):
    for j in range(len(selected_feats)):
        ax3b.text(j, i, f'{heat_data[i, j]:.2f}', ha='center', va='center',
                  fontsize=7, color='black')

plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2024_U5_3_fig3_coef.png'), bbox_inches='tight', dpi=150)
plt.close(fig3)
print("  → 2024_U5_3_fig3_coef.png 保存完了")

# ────────────────────────────────────────────────────────────────────────────
# 図4：予測値 vs 実測値（大学進学率）
# ────────────────────────────────────────────────────────────────────────────
print("図4: 予測値 vs 実測値散布図を作成中...")

# 注目都道府県（大学進学率上位・下位）
y_series = pd.Series(y, index=PREFS)
highlight_prefs = list(y_series.nlargest(3).index) + list(y_series.nsmallest(2).index)

fig4, ax4 = plt.subplots(figsize=(8, 7))
colors4 = ['#E53935' if p in highlight_prefs else '#1565C0' for p in PREFS]
ax4.scatter(y, y_pred, c=colors4, s=70, alpha=0.85, zorder=3, edgecolors='white', linewidths=0.5)

lim_min = min(y.min(), y_pred.min()) - 2
lim_max = max(y.max(), y_pred.max()) + 2
ax4.plot([lim_min, lim_max], [lim_min, lim_max], 'k--', linewidth=1.5, label='完全一致線')

ax4.set_xlabel('実測値（大学進学率 %）', fontsize=12)
ax4.set_ylabel('予測値（Elastic Net）', fontsize=12)
ax4.set_title(
    f'予測値 vs 実測値（大学進学率）\nR² = {r2:.3f},  RMSE = {rmse:.2f} [%]',
    fontsize=13, fontweight='bold')
ax4.legend(fontsize=10)
ax4.grid(True, alpha=0.2)

# 注目都道府県にラベル
for pref in highlight_prefs:
    idx = PREFS.index(pref)
    ax4.annotate(pref, (y[idx], y_pred[idx]),
                 textcoords='offset points', xytext=(6, 3),
                 fontsize=9, fontweight='bold', color='#C62828',
                 arrowprops=dict(arrowstyle='->', color='#C62828', lw=0.8))

ax4.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax4.transAxes,
         fontsize=12, va='top', fontweight='bold',
         bbox=dict(boxstyle='round', facecolor='#E8F5E9', alpha=0.85))

plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2024_U5_3_fig4_scatter.png'), bbox_inches='tight', dpi=150)
plt.close(fig4)
print("  → 2024_U5_3_fig4_scatter.png 保存完了")

# ═══════════════════════════════════════════════════════════════════════════════
# ■ 主要知見サマリー
# ═══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("✓ 全図の生成完了（4枚）")
print("=" * 60)
print("\n【主要知見】")
print(f"  データ     : SSDSE-B-2026.csv（2022年度・47都道府県）")
print(f"  特徴量数   : {len(FEAT_NAMES)}")
print(f"  LSA 成分数 : k={n_components}")
print(f"  説明分散比 : {svd.explained_variance_ratio_.round(3)}")
print(f"  最適α      : {en_cv.alpha_:.4f}")
print(f"  最適l1_ratio: {en_cv.l1_ratio_:.2f}")
print(f"  モデル R²  : {r2:.3f}")
print(f"  RMSE       : {rmse:.2f} [%]")
print(f"\n  大学進学率 上位3県: {list(y_series.nlargest(3).index)}")
print(f"  大学進学率 下位2県: {list(y_series.nsmallest(2).index)}")
print(f"\n  第1成分 上位負荷量: {top_words1[:4]}")
print(f"  第2成分 上位負荷量: {top_words2[:4]}")