"""
2018_H2_yushu.py
都道府県別消費格差の動態分析：ジニ係数の時系列変化と規定要因
2018年度 統計データ分析コンペティション 優秀賞（高校生部門）
教育用再現スクリプト — 実データ: SSDSE-B-2026.csv のみ使用
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2018_H2_yushu.py
# ============================================================


import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import statsmodels.api as sm
from scipy import stats

# ── フォント設定 ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

# ── パス設定 ──────────────────────────────────────────────────────────────────
FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ── データ読み込み ────────────────────────────────────────────────────────────
df_b = pd.read_csv(DATA_B, encoding='cp932', header=1)
df_b = df_b[df_b['地域コード'].str.match(r'^R\d{5}', na=False)].copy()
df_b['年度'] = df_b['年度'].astype(int)

print("=== 利用可能な列 ===")
print(df_b.columns.tolist())
print()

# ── 分析変数を構築 ────────────────────────────────────────────────────────────
cons_col = '消費支出（二人以上の世帯）'

# 有効求人倍率 = 月間有効求人数 / 月間有効求職者数
df_b['有効求人倍率'] = df_b['月間有効求人数（一般）'] / df_b['月間有効求職者数（一般）']

# 高齢化率 = 65歳以上人口 / 総人口 × 100
df_b['高齢化率'] = df_b['65歳以上人口'] / df_b['総人口'] * 100

# 大学進学率 = 高等学校卒業者のうち進学者数 / 高等学校卒業者数 × 100
df_b['大学進学率'] = df_b['高等学校卒業者のうち進学者数'] / df_b['高等学校卒業者数'] * 100

# 人口密度の代理変数 = 着工建築物数 / 総人口 × 10000（相対都市化指標）
# ※SSDSE-Bに面積データがないため、転入者数/総人口を都市集積度の代理とする
df_b['都市集積度'] = df_b['転入者数（日本人移動者）'] / df_b['総人口'] * 100

# 消費支出を万円単位に
df_b['消費支出万'] = df_b[cons_col] / 10000

print("=== データサマリー（主要変数） ===")
X_cols = ['有効求人倍率', '高齢化率', '大学進学率', '都市集積度']
print(df_b[['年度', '都道府県', cons_col] + X_cols].describe())
print()

# ── 地域ブロック定義 ──────────────────────────────────────────────────────────
# 都道府県名は「〇〇県」「〇〇府」「〇〇都」「北海道」形式
region_map_raw = {
    '北海道': '北海道・東北', '青森': '北海道・東北', '岩手': '北海道・東北', '宮城': '北海道・東北',
    '秋田': '北海道・東北', '山形': '北海道・東北', '福島': '北海道・東北',
    '茨城': '関東', '栃木': '関東', '群馬': '関東', '埼玉': '関東', '千葉': '関東', '東京': '関東', '神奈川': '関東',
    '新潟': '中部', '富山': '中部', '石川': '中部', '福井': '中部', '山梨': '中部',
    '長野': '中部', '岐阜': '中部', '静岡': '中部', '愛知': '中部',
    '三重': '近畿', '滋賀': '近畿', '京都': '近畿', '大阪': '近畿', '兵庫': '近畿', '奈良': '近畿', '和歌山': '近畿',
    '鳥取': '中国・四国', '島根': '中国・四国', '岡山': '中国・四国', '広島': '中国・四国',
    '山口': '中国・四国', '徳島': '中国・四国', '香川': '中国・四国', '愛媛': '中国・四国', '高知': '中国・四国',
    '福岡': '九州・沖縄', '佐賀': '九州・沖縄', '長崎': '九州・沖縄', '熊本': '九州・沖縄',
    '大分': '九州・沖縄', '宮崎': '九州・沖縄', '鹿児島': '九州・沖縄', '沖縄': '九州・沖縄'
}
region_colors = {
    '北海道・東北': '#4e9af1',
    '関東':        '#e05c5c',
    '中部':        '#f0a500',
    '近畿':        '#5cb85c',
    '中国・四国':  '#9b59b6',
    '九州・沖縄':  '#f39c12',
}


def get_region(pref_name):
    """都道府県名（〇〇県など）から地域ブロックを返す"""
    short = pref_name.replace('県', '').replace('府', '').replace('都', '').replace('道', '')
    return region_map_raw.get(short, '関東')


def short_name(pref_name):
    """都道府県名から短縮形（県・府・都・道を除去）"""
    return pref_name.replace('県', '').replace('府', '').replace('都', '').replace('道', '')


df_b['地域ブロック'] = df_b['都道府県'].apply(get_region)
df_b['都道府県短'] = df_b['都道府県'].apply(short_name)

# ── ジニ係数・変動係数の計算 ──────────────────────────────────────────────────


def gini(arr):
    v = np.sort(arr)
    n = len(v)
    idx = np.arange(1, n + 1)
    return (2 * np.sum(idx * v) / (n * np.sum(v))) - (n + 1) / n


gini_ts = df_b.groupby('年度')[cons_col].apply(lambda x: gini(x.dropna().values))
cv_ts   = df_b.groupby('年度')[cons_col].apply(lambda x: x.std() / x.mean())

print("=== ジニ係数の時系列 ===")
for yr, g in gini_ts.items():
    print(f"  {yr}: Gini={g:.4f}, CV={cv_ts[yr]:.4f}")
print()

# 最新年を特定（2022年まで＝SSDSE-Bの完全年次）
latest_year = 2022
df_latest = df_b[df_b['年度'] == latest_year].copy()

# ─────────────────────────────────────────────────────────────────────────────
# 図1: 都道府県別消費支出ランキング（最新年 2022年）横棒グラフ
# ─────────────────────────────────────────────────────────────────────────────
print("=== 図1: 都道府県別消費支出ランキング ===")

df1 = df_latest[['都道府県', '都道府県短', '地域ブロック', cons_col]].dropna().sort_values(cons_col)
nat_avg_latest = df1[cons_col].mean()

fig1, ax1 = plt.subplots(figsize=(11, 14))

colors_bar = [region_colors.get(r, '#999999') for r in df1['地域ブロック']]
bars1 = ax1.barh(df1['都道府県短'], df1[cons_col] / 10000,
                 color=colors_bar, alpha=0.85, edgecolor='white', linewidth=0.4)

ax1.axvline(nat_avg_latest / 10000, color='black', linewidth=2.0, linestyle='--',
            label=f'全国平均（{nat_avg_latest / 10000:.1f}万円）')

for bar, val in zip(bars1, df1[cons_col]):
    ax1.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2,
             f'{val / 10000:.1f}', va='center', fontsize=8)

region_patches1 = [mpatches.Patch(color=c, label=r, alpha=0.85)
                   for r, c in region_colors.items()]
nat_line1 = plt.Line2D([0], [0], color='black', linewidth=2.0, linestyle='--',
                       label=f'全国平均（{nat_avg_latest / 10000:.1f}万円）')
ax1.legend(handles=region_patches1 + [nat_line1], fontsize=9,
           loc='lower right', framealpha=0.9)

ax1.set_title(f'図1 都道府県別消費支出ランキング（{latest_year}年・地域色分け）',
              fontsize=13, fontweight='bold', pad=12)
ax1.set_xlabel('消費支出（万円/月・二人以上世帯）', fontsize=11)
ax1.set_xlim(0, df1[cons_col].max() / 10000 * 1.15)
ax1.grid(axis='x', alpha=0.3)
plt.tight_layout()
fig1_path = os.path.join(FIG_DIR, '2018_H2_fig1.png')
fig1.savefig(fig1_path, bbox_inches='tight')
plt.close(fig1)
print(f"saved: {fig1_path}")

print(f"  最高: {df1.iloc[-1]['都道府県']} ({df1.iloc[-1][cons_col]/10000:.1f}万円)")
print(f"  最低: {df1.iloc[0]['都道府県']} ({df1.iloc[0][cons_col]/10000:.1f}万円)")
print(f"  全国平均: {nat_avg_latest/10000:.1f}万円")
print()

# ─────────────────────────────────────────────────────────────────────────────
# 図2: 都道府県間消費格差の時系列（ジニ係数と変動係数の2軸折れ線）
# ─────────────────────────────────────────────────────────────────────────────
print("=== 図2: 消費格差の時系列 ===")

years_plot = [y for y in sorted(gini_ts.index) if 2012 <= y <= 2022]
g_vals = [gini_ts[y] for y in years_plot]
cv_vals = [cv_ts[y] for y in years_plot]

fig2, ax2a = plt.subplots(figsize=(11, 6))
ax2b = ax2a.twinx()

# COVID-19期間ハイライト（2020-2021）
ax2a.axvspan(2019.5, 2021.5, alpha=0.13, color='gray', label='COVID-19期間（2020-2021）')

line_g, = ax2a.plot(years_plot, g_vals, color='#1565C0', linewidth=2.5,
                    marker='o', markersize=7, label='ジニ係数（左軸）', zorder=4)
line_cv, = ax2b.plot(years_plot, cv_vals, color='#E65100', linewidth=2.5,
                     marker='s', markersize=7, linestyle='--',
                     label='変動係数 CV（右軸）', zorder=4)

# 値ラベル
for x, g, cv in zip(years_plot, g_vals, cv_vals):
    ax2a.annotate(f'{g:.4f}', (x, g), textcoords='offset points', xytext=(0, 8),
                  ha='center', fontsize=8, color='#1565C0')
    ax2b.annotate(f'{cv:.4f}', (x, cv), textcoords='offset points', xytext=(0, -14),
                  ha='center', fontsize=8, color='#E65100')

ax2a.set_ylabel('ジニ係数', fontsize=11, color='#1565C0')
ax2b.set_ylabel('変動係数（CV）', fontsize=11, color='#E65100')
ax2a.set_xlabel('年度', fontsize=11)
ax2a.tick_params(axis='y', labelcolor='#1565C0')
ax2b.tick_params(axis='y', labelcolor='#E65100')
ax2a.set_xticks(years_plot)
ax2a.set_xlim(2011.5, 2022.5)

covid_patch = mpatches.Patch(color='gray', alpha=0.3, label='COVID-19期間（2020-2021）')
ax2a.legend(handles=[line_g, line_cv, covid_patch], fontsize=10,
            loc='upper left', framealpha=0.9)

ax2a.set_title('図2 都道府県間消費格差の時系列変化（2012-2022年）',
               fontsize=13, fontweight='bold', pad=12)
ax2a.grid(axis='y', alpha=0.3)
plt.tight_layout()
fig2_path = os.path.join(FIG_DIR, '2018_H2_fig2.png')
fig2.savefig(fig2_path, bbox_inches='tight')
plt.close(fig2)
print(f"saved: {fig2_path}")
print(f"  ジニ係数最大: {max(g_vals):.4f} ({years_plot[g_vals.index(max(g_vals))]}年)")
print(f"  ジニ係数最小: {min(g_vals):.4f} ({years_plot[g_vals.index(min(g_vals))]}年)")
print()

# ─────────────────────────────────────────────────────────────────────────────
# 図3: 有効求人倍率 vs 消費支出 散布図（2022年断面・47都道府県ラベル付き）
# ─────────────────────────────────────────────────────────────────────────────
print("=== 図3: 有効求人倍率 vs 消費支出 散布図 ===")

df3 = df_latest[['都道府県', '都道府県短', '地域ブロック', cons_col, '有効求人倍率']].dropna().copy()

fig3, ax3 = plt.subplots(figsize=(12, 9))

for _, row in df3.iterrows():
    region = row['地域ブロック']
    color  = region_colors.get(region, '#999999')
    ax3.scatter(row['有効求人倍率'], row[cons_col] / 10000,
                color=color, s=65, alpha=0.85, zorder=3)
    ax3.annotate(row['都道府県短'],
                 xy=(row['有効求人倍率'], row[cons_col] / 10000),
                 xytext=(3, 3), textcoords='offset points',
                 fontsize=7.5, color='#333333')

# 回帰直線
x_vals = df3['有効求人倍率'].values
y_vals = df3[cons_col].values / 10000
slope3, intercept3, r3, p3, se3 = stats.linregress(x_vals, y_vals)
x_line3 = np.linspace(x_vals.min(), x_vals.max(), 100)
ax3.plot(x_line3, intercept3 + slope3 * x_line3, color='crimson', linewidth=2.2,
         zorder=4, label=f'回帰直線 (r={r3:.3f}, p={p3:.4f})')

region_patches3 = [mpatches.Patch(color=c, label=r, alpha=0.85)
                   for r, c in region_colors.items()]
reg_line3 = plt.Line2D([0], [0], color='crimson', linewidth=2.2,
                       label=f'回帰直線 r={r3:.3f}, p={p3:.4f}')
ax3.legend(handles=region_patches3 + [reg_line3], fontsize=9,
           loc='upper left', framealpha=0.9)

ax3.set_title(f'図3 有効求人倍率 vs 消費支出 都道府県別散布図（{latest_year}年）',
              fontsize=13, fontweight='bold', pad=12)
ax3.set_xlabel('有効求人倍率（倍）', fontsize=11)
ax3.set_ylabel('消費支出（万円/月・二人以上世帯）', fontsize=11)
ax3.grid(alpha=0.3)
plt.tight_layout()
fig3_path = os.path.join(FIG_DIR, '2018_H2_fig3.png')
fig3.savefig(fig3_path, bbox_inches='tight')
plt.close(fig3)
print(f"saved: {fig3_path}")
print(f"  有効求人倍率 vs 消費支出: r={r3:.3f}, p={p3:.6f}")
print()

# ─────────────────────────────────────────────────────────────────────────────
# 重回帰分析（OLS）: 2022年断面
# ─────────────────────────────────────────────────────────────────────────────
print("=== 重回帰分析（OLS） ===")

df_reg = df_latest[['都道府県', cons_col] + X_cols].dropna().copy()
y_reg  = df_reg[cons_col].values
X_reg  = df_reg[X_cols].values

# 標準化
y_std  = (y_reg - y_reg.mean()) / y_reg.std()
X_std  = (X_reg - X_reg.mean(axis=0)) / X_reg.std(axis=0)

X_sm   = sm.add_constant(X_std)
model  = sm.OLS(y_std, X_sm).fit()
print(model.summary())
print()

# 標準化偏回帰係数・95%CI・p値
coef_names = X_cols
coefs  = model.params[1:]
ci_raw = model.conf_int()
ci     = pd.DataFrame(ci_raw[1:], index=coef_names)
pvals  = model.pvalues[1:]

print("標準化偏回帰係数:")
for nm, coef, pv in zip(coef_names, coefs, pvals):
    star = '***' if pv < 0.001 else ('**' if pv < 0.01 else ('*' if pv < 0.05 else 'n.s.'))
    print(f"  {nm}: β={coef:.4f}, p={pv:.4f} {star}")
print()

# ─────────────────────────────────────────────────────────────────────────────
# 図4: 重回帰の標準化偏回帰係数（横棒グラフ + 95%CI + 有意マーク）
# ─────────────────────────────────────────────────────────────────────────────
print("=== 図4: 標準化偏回帰係数 ===")

var_labels4 = {
    '有効求人倍率': '有効求人倍率\n（倍）',
    '高齢化率':    '高齢化率\n（65歳以上%）',
    '大学進学率':  '大学進学率\n（高卒→大学%）',
    '都市集積度':  '都市集積度\n（転入率%）',
}

coef_arr  = np.array(coefs)
ci_lo     = ci.iloc[:, 0].values
ci_hi     = ci.iloc[:, 1].values
pvals_arr = np.array(pvals)

# 係数の大きい順にソート
sort_idx  = np.argsort(np.abs(coef_arr))[::-1]
sorted_names  = [coef_names[i] for i in sort_idx]
sorted_labels = [var_labels4.get(coef_names[i], coef_names[i]) for i in sort_idx]
sorted_coefs  = coef_arr[sort_idx]
sorted_cilo   = ci_lo[sort_idx]
sorted_cihi   = ci_hi[sort_idx]
sorted_pvals  = pvals_arr[sort_idx]

y_pos4 = np.arange(len(sorted_names))

fig4, ax4 = plt.subplots(figsize=(10, 6))

bar_colors4 = ['#1565C0' if c >= 0 else '#C62828' for c in sorted_coefs]
bars4 = ax4.barh(y_pos4, sorted_coefs, color=bar_colors4, alpha=0.80,
                 height=0.55, edgecolor='white')

# 95% CI エラーバー
xerr_lo = sorted_coefs - sorted_cilo
xerr_hi = sorted_cihi - sorted_coefs
ax4.errorbar(sorted_coefs, y_pos4, xerr=[xerr_lo, xerr_hi],
             fmt='none', color='black', capsize=5, linewidth=1.5, zorder=5)

# 有意マーク
for i, (coef, pv) in enumerate(zip(sorted_coefs, sorted_pvals)):
    star = '***' if pv < 0.001 else ('**' if pv < 0.01 else ('*' if pv < 0.05 else 'n.s.'))
    offset = 0.01 if coef >= 0 else -0.01
    ha = 'left' if coef >= 0 else 'right'
    ax4.text(sorted_cihi[i] + 0.02, i, star,
             va='center', ha='left', fontsize=10,
             color='#C62828' if pv < 0.05 else '#999999', fontweight='bold')

ax4.set_yticks(y_pos4)
ax4.set_yticklabels(sorted_labels, fontsize=10)
ax4.axvline(0, color='black', linewidth=1.0, linestyle='--')

r2_adj = model.rsquared_adj
ax4.set_title(f'図4 重回帰分析：標準化偏回帰係数と95%信頼区間（2022年断面）\n'
              f'Adj.R²={r2_adj:.3f}  N=47都道府県',
              fontsize=12, fontweight='bold', pad=12)
ax4.set_xlabel('標準化偏回帰係数（β）', fontsize=11)
ax4.grid(axis='x', alpha=0.3)

# 凡例
pos_patch = mpatches.Patch(color='#1565C0', alpha=0.8, label='正の効果')
neg_patch = mpatches.Patch(color='#C62828', alpha=0.8, label='負の効果')
ax4.legend(handles=[pos_patch, neg_patch], fontsize=9, loc='lower right')

plt.tight_layout()
fig4_path = os.path.join(FIG_DIR, '2018_H2_fig4.png')
fig4.savefig(fig4_path, bbox_inches='tight')
plt.close(fig4)
print(f"saved: {fig4_path}")

# ─────────────────────────────────────────────────────────────────────────────
# 結果サマリー
# ─────────────────────────────────────────────────────────────────────────────
print()
print("=== 結果サマリー ===")
print(f"分析期間: 2012-2022年, 47都道府県")
print()
print(f"[消費格差（ジニ係数）推移]")
for yr, g in zip(years_plot, g_vals):
    print(f"  {yr}: {g:.4f}")
print()
print(f"[消費支出ランキング上位5 ({latest_year}年)]")
top5_df = df1.tail(5)
for _, rr in top5_df.iloc[::-1].iterrows():
    print(f"  {rr['都道府県']}: {rr[cons_col]/10000:.1f}万円")
print()
print(f"[消費支出ランキング下位5 ({latest_year}年)]")
bot5_df = df1.head(5)
for _, rr in bot5_df.iterrows():
    print(f"  {rr['都道府県']}: {rr[cons_col]/10000:.1f}万円")
print()
print(f"[重回帰結果 Adj.R²={r2_adj:.3f}]")
for nm, coef, pv in zip(coef_names, coefs, pvals):
    star = '***' if pv < 0.001 else ('**' if pv < 0.01 else ('*' if pv < 0.05 else 'n.s.'))
    print(f"  {nm}: β={coef:.4f}, p={pv:.4f} {star}")
print()
print("DONE: 2018_H2_yushu")