"""
2022_H5_11_shorei.py
====================
教育用再現コード：2022年度 統計データ分析コンペティション 審査員奨励賞（高校生の部）
論文タイトル：空き家問題と人口減少：転出超過・高齢化が住宅に与える影響

分析概要
--------
空き家の直接データが SSDSE-B に含まれないため、以下の代理変数で分析する：

  着工新設住宅率   = 着工新設住宅戸数 / 総人口 × 10000
                    （新規建設の多い地域 → 既存住宅が余剰 → 空き家リスク増）
  転出超過率       = (転出者数 - 転入者数) / 総人口 × 1000
                    （転出超過が大きいほど空き家が増加しやすい）
  高齢化率         = 65歳以上人口 / 総人口 × 100
                    （高齢化が進むほど将来的な空き家予備軍が増える）
  住宅地地価       = 標準価格（平均価格）（住宅地）（円/m²）
  消費支出_log     = log(消費支出（二人以上の世帯）)
  求人倍率         = 月間有効求人数（一般）/ 月間有効求職者数（一般）

図の構成（4枚）
---------------
  Fig1: 転出超過率の時系列推移（東京・大阪・秋田・島根）
  Fig2: 高齢化率 vs 転出超過率 散布図（2022年, 都道府県ラベル付き）
  Fig3: OLS 回帰係数プロット（着工新設住宅率の決定要因）
  Fig4: 住宅地地価の都道府県別ランキング（2022年）

使用データ
----------
  SSDSE-B-2026.csv（都道府県別統計データ）
  出典：統計数理研究所 SSDSE（社会・人口統計体系データセット）

注意：合成データは一切使用しない。実公的データのみ。
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2022_H5_11_shorei.py
# ============================================================


import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib import rcParams
import statsmodels.api as sm
from scipy import stats

warnings.filterwarnings('ignore')

# ── パス設定 ──────────────────────────────────────────────
FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'

os.makedirs(FIG_DIR, exist_ok=True)

# ── フォント設定 ───────────────────────────────────────────
rcParams['font.family'] = ['Hiragino Sans', 'Hiragino Kaku Gothic ProN',
                            'AppleGothic', 'Noto Sans CJK JP', 'sans-serif']
rcParams['axes.unicode_minus'] = False

# ── データ読み込み ────────────────────────────────────────
def load_ssdse_b(path):
    """SSDSE-B-2026.csv を読み込んで整形する。
    1行目がコード行、2行目が日本語ラベル行、3行目以降がデータ。
    """
    raw = pd.read_csv(path, header=0, encoding='cp932')
    # 行0はJapanese labels、行1以降が実データ
    data = raw.iloc[1:].copy().reset_index(drop=True)
    data.columns = raw.columns
    # 列名を日本語ラベルにリネーム
    label_row = raw.iloc[0]
    rename_map = {col: label_row[col] for col in raw.columns}
    data = data.rename(columns=rename_map)
    # 年度列を整数に
    data = data.rename(columns={'年度': 'year', '都道府県': 'pref', '地域コード': 'code'})
    data['year'] = data['year'].astype(int)
    # 数値列を変換
    for col in data.columns:
        if col not in ['year', 'pref', 'code']:
            data[col] = pd.to_numeric(data[col], errors='coerce')
    return data

print("データ読み込み中...")
df = load_ssdse_b(DATA_B)
print(f"  読み込み完了: {df.shape[0]}行 × {df.shape[1]}列")
print(f"  年度: {sorted(df['year'].unique())}")

# ── 派生変数の計算 ────────────────────────────────────────
# 着工新設住宅率（万人あたり）
df['着工新設住宅率'] = df['着工新設住宅戸数'] / df['総人口'] * 10000

# 転出超過率（千人あたり）
df['転出超過率'] = (df['転出者数（日本人移動者）'] - df['転入者数（日本人移動者）']) / df['総人口'] * 1000

# 高齢化率（%）
df['高齢化率'] = df['65歳以上人口'] / df['総人口'] * 100

# 消費支出（対数）
df['消費支出_log'] = np.log(df['消費支出（二人以上の世帯）'])

# 求人倍率
df['求人倍率'] = df['月間有効求人数（一般）'] / df['月間有効求職者数（一般）']

print("派生変数計算完了。")

# ── 2022年データ抽出 ──────────────────────────────────────
df2022 = df[df['year'] == 2022].copy().reset_index(drop=True)
print(f"  2022年データ: {len(df2022)}都道府県")

# ============================================================
# Fig1: 転出超過率の時系列推移（代表4都道府県）
# ============================================================
print("\nFig1: 転出超過率の時系列推移...")

TARGET_PREFS = ['東京都', '大阪府', '秋田県', '島根県']
COLORS = ['#E53935', '#1565C0', '#43A047', '#7B1FA2']
MARKERS = ['o', 's', '^', 'D']

fig1, ax1 = plt.subplots(figsize=(9, 5.5), dpi=150)

for pref, color, marker in zip(TARGET_PREFS, COLORS, MARKERS):
    pdata = df[df['pref'] == pref].sort_values('year')
    ax1.plot(pdata['year'], pdata['転出超過率'],
             color=color, marker=marker, markersize=5,
             linewidth=2, label=pref)

ax1.axhline(0, color='gray', linewidth=0.8, linestyle='--', alpha=0.7)
ax1.set_xlabel('年度', fontsize=12)
ax1.set_ylabel('転出超過率（‰、千人あたり）', fontsize=12)
ax1.set_title('Fig1: 転出超過率の時系列推移\n（東京・大阪・秋田・島根）', fontsize=13, fontweight='bold')
ax1.legend(loc='upper left', fontsize=11)
ax1.xaxis.set_major_locator(mticker.MultipleLocator(2))
ax1.xaxis.set_major_formatter(mticker.FormatStrFormatter('%d'))
ax1.grid(axis='y', alpha=0.3)

# 正負の意味を注釈
ax1.annotate('正値 = 転出超過（人口流出）', xy=(0.02, 0.97),
             xycoords='axes fraction', fontsize=9, color='gray',
             va='top')

fig1.tight_layout()
fig1_path = os.path.join(FIG_DIR, '2022_H5_11_fig1_netout_timeseries.png')
fig1.savefig(fig1_path, dpi=150, bbox_inches='tight')
plt.close(fig1)
print(f"  保存: {fig1_path}")

# ============================================================
# Fig2: 高齢化率 vs 転出超過率 散布図（2022年）
# ============================================================
print("Fig2: 高齢化率 vs 転出超過率 散布図...")

d2 = df2022[['pref', '高齢化率', '転出超過率']].dropna()

fig2, ax2 = plt.subplots(figsize=(10, 7.5), dpi=150)

# 転出超過率の符号で色分け
colors2 = ['#E53935' if v > 0 else '#1565C0' for v in d2['転出超過率']]
ax2.scatter(d2['高齢化率'], d2['転出超過率'],
            c=colors2, s=55, alpha=0.8, zorder=3)

# 都道府県ラベル（重要なもの）
highlight = ['東京都', '大阪府', '秋田県', '島根県', '鳥取県', '高知県',
             '沖縄県', '愛知県', '神奈川県', '北海道', '宮城県', '福島県']
for _, row in d2.iterrows():
    if row['pref'] in highlight:
        offset_x = 0.2
        offset_y = 0.05
        ax2.annotate(row['pref'],
                     xy=(row['高齢化率'], row['転出超過率']),
                     xytext=(row['高齢化率'] + offset_x, row['転出超過率'] + offset_y),
                     fontsize=8, color='#333333',
                     arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))
    else:
        ax2.annotate(row['pref'],
                     xy=(row['高齢化率'], row['転出超過率']),
                     fontsize=7, color='#555555',
                     ha='center', va='bottom')

# 回帰直線
x2 = d2['高齢化率'].values
y2 = d2['転出超過率'].values
slope, intercept, r_val, p_val, _ = stats.linregress(x2, y2)
x_line = np.linspace(x2.min(), x2.max(), 100)
ax2.plot(x_line, intercept + slope * x_line,
         color='#F57C00', linewidth=2, linestyle='--',
         label=f'回帰直線 (r={r_val:.2f}, p={p_val:.3f})')

ax2.axhline(0, color='gray', linewidth=0.8, linestyle=':', alpha=0.7)
ax2.set_xlabel('高齢化率（%）', fontsize=12)
ax2.set_ylabel('転出超過率（‰）', fontsize=12)
ax2.set_title('Fig2: 高齢化率 vs 転出超過率（2022年、都道府県別）', fontsize=13, fontweight='bold')
ax2.legend(fontsize=10)
ax2.grid(alpha=0.25)

# 凡例の追加（色の意味）
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#E53935',
           markersize=9, label='転出超過（流出）'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#1565C0',
           markersize=9, label='転入超過（流入）'),
]
ax2.legend(handles=legend_elements + [
    Line2D([0], [0], color='#F57C00', linewidth=2, linestyle='--',
           label=f'回帰直線 (r={r_val:.2f}, p={p_val:.3f})')
], fontsize=9, loc='upper left')

fig2.tight_layout()
fig2_path = os.path.join(FIG_DIR, '2022_H5_11_fig2_aging_scatter.png')
fig2.savefig(fig2_path, dpi=150, bbox_inches='tight')
plt.close(fig2)
print(f"  保存: {fig2_path}")

# ============================================================
# Fig3: OLS 回帰係数プロット（着工新設住宅率の決定要因）
# ============================================================
print("Fig3: OLS 回帰係数プロット...")

# 説明変数の選択
REG_VARS = ['転出超過率', '高齢化率', '消費支出_log', '求人倍率']
REG_LABELS = ['転出超過率\n（‰）', '高齢化率\n（%）', '消費支出\n（対数）', '求人倍率']
TARGET = '着工新設住宅率'

d3 = df2022[REG_VARS + [TARGET]].dropna()
print(f"  回帰データ: {len(d3)}都道府県")

# 標準化（係数比較のため）
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X3_scaled = scaler.fit_transform(d3[REG_VARS])
y3 = d3[TARGET].values

X3_sm = sm.add_constant(X3_scaled)
model3 = sm.OLS(y3, X3_sm).fit()
print(model3.summary())

coefs = model3.params[1:]  # 定数項を除く
conf_int = model3.conf_int()[1:]
p_values = model3.pvalues[1:]

fig3, ax3 = plt.subplots(figsize=(8, 5), dpi=150)

y_pos = np.arange(len(REG_VARS))
bar_colors = ['#E53935' if p < 0.05 else '#90A4AE' for p in p_values]
bars = ax3.barh(y_pos, coefs, color=bar_colors, alpha=0.85, height=0.55)

# 信頼区間のエラーバー
for i, (ci_low, ci_high) in enumerate(conf_int):
    ax3.plot([ci_low, ci_high], [i, i], color='#212121', linewidth=2.5, zorder=5)
    ax3.plot([ci_low, ci_low], [i - 0.18, i + 0.18], color='#212121', linewidth=1.5, zorder=5)
    ax3.plot([ci_high, ci_high], [i - 0.18, i + 0.18], color='#212121', linewidth=1.5, zorder=5)

ax3.axvline(0, color='gray', linewidth=0.8, linestyle='--')
ax3.set_yticks(y_pos)
ax3.set_yticklabels(REG_LABELS, fontsize=11)
ax3.set_xlabel('標準化回帰係数（95%CI）', fontsize=11)
ax3.set_title(f'Fig3: 着工新設住宅率の決定要因（OLS回帰、2022年）\n'
              f'Adj.R²={model3.rsquared_adj:.3f}  n={len(d3)}',
              fontsize=12, fontweight='bold')

# p値の注釈
for i, (coef, p) in enumerate(zip(coefs, p_values)):
    sig_str = '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    ax3.text(coef + (0.03 if coef >= 0 else -0.03),
             i, sig_str,
             ha='left' if coef >= 0 else 'right',
             va='center', fontsize=11,
             color='#C62828' if sig_str != 'n.s.' else '#888888')

# 凡例
from matplotlib.patches import Patch
legend_elems = [
    Patch(facecolor='#E53935', alpha=0.85, label='有意（p<0.05）'),
    Patch(facecolor='#90A4AE', alpha=0.85, label='非有意'),
]
ax3.legend(handles=legend_elems, fontsize=9, loc='lower right')
ax3.grid(axis='x', alpha=0.3)

fig3.tight_layout()
fig3_path = os.path.join(FIG_DIR, '2022_H5_11_fig3_ols_coef.png')
fig3.savefig(fig3_path, dpi=150, bbox_inches='tight')
plt.close(fig3)
print(f"  保存: {fig3_path}")

# ============================================================
# Fig4: 住宅地地価の都道府県別ランキング（2022年）
# ============================================================
print("Fig4: 住宅地地価ランキング...")

d4 = df2022[['pref', '標準価格（平均価格）（住宅地）']].dropna().copy()
d4 = d4.rename(columns={'標準価格（平均価格）（住宅地）': '住宅地地価'})
d4 = d4.sort_values('住宅地地価', ascending=True).reset_index(drop=True)

# 上位・下位に色分け
n4 = len(d4)
bar_c4 = []
for i in range(n4):
    if i >= n4 - 5:       # 上位5（高い）
        bar_c4.append('#E53935')
    elif i < 5:            # 下位5（低い）
        bar_c4.append('#1565C0')
    else:
        bar_c4.append('#78909C')

fig4, ax4 = plt.subplots(figsize=(9, 11), dpi=150)
ax4.barh(range(n4), d4['住宅地地価'] / 10000, color=bar_c4, alpha=0.85)

ax4.set_yticks(range(n4))
ax4.set_yticklabels(d4['pref'], fontsize=8.5)
ax4.set_xlabel('住宅地地価（万円/m²）', fontsize=11)
ax4.set_title('Fig4: 住宅地地価の都道府県別ランキング（2022年）\n'
              '※ 標準価格（平均価格）（住宅地）', fontsize=12, fontweight='bold')

# 地価の数値ラベル
for i, val in enumerate(d4['住宅地地価']):
    ax4.text(val / 10000 + 0.3, i, f'{val/10000:.1f}',
             va='center', fontsize=7.5, color='#333333')

# 凡例
from matplotlib.patches import Patch as MPatch
legend_items = [
    MPatch(facecolor='#E53935', alpha=0.85, label='地価上位5（高い）'),
    MPatch(facecolor='#1565C0', alpha=0.85, label='地価下位5（低い）'),
    MPatch(facecolor='#78909C', alpha=0.85, label='その他'),
]
ax4.legend(handles=legend_items, fontsize=9, loc='lower right')
ax4.grid(axis='x', alpha=0.3)

fig4.tight_layout()
fig4_path = os.path.join(FIG_DIR, '2022_H5_11_fig4_landprice_rank.png')
fig4.savefig(fig4_path, dpi=150, bbox_inches='tight')
plt.close(fig4)
print(f"  保存: {fig4_path}")

# ============================================================
# 統計サマリーの出力
# ============================================================
print("\n=== 2022年 変数サマリー ===")
summary_cols = ['着工新設住宅率', '転出超過率', '高齢化率',
                '標準価格（平均価格）（住宅地）', '消費支出_log', '求人倍率']
summary_labels = ['着工新設住宅率', '転出超過率', '高齢化率',
                  '住宅地地価', '消費支出_log', '求人倍率']
print(df2022[summary_cols].describe().round(3))

# 相関行列
print("\n=== 相関係数（2022年） ===")
corr_df = df2022[summary_cols].rename(columns=dict(zip(summary_cols, summary_labels)))
print(corr_df.corr().round(3))

print("\n全図の保存完了。")
print(f"  保存先: {os.path.abspath(FIG_DIR)}")