"""
教育用再現コード: 2022年度 統計データ分析コンペティション 審査員奨励賞（高校生の部）
=================================================================
論文タイトル：観光業と地域経済：延べ宿泊者数が消費支出に与える影響

【分析概要】
  データ：SSDSE-B-2026.csv（都道府県データ、2012〜2023年）
  目的変数：消費支出（二人以上の世帯）
  主要説明変数：
    - 宿泊密度   = 延べ宿泊者数 / 総人口（観光強度の指標）
    - 外国人比率  = 外国人延べ宿泊者数 / 延べ宿泊者数 × 100（インバウンド需要）
    - 旅館密度   = 旅館営業施設数 / 総人口 × 10000（観光インフラ）
    - 高齢化率   = 65歳以上人口 / 総人口 × 100（人口構造）

  Step1. 時系列分析（地域別延べ宿泊者数、COVID-19の影響）
  Step2. 散布図分析（宿泊密度 vs 消費支出、2022年）
  Step3. OLS回帰（消費支出の決定要因、2022年）
  Step4. 外国人比率ランキング（2019年 vs 2022年比較）

【COVID-19の自然実験】
  2020〜2021年の急減は「自然実験（Natural Experiment）」として捉えることができる。
  観光需要の外生的ショックが消費支出に与えた影響を観察することで、
  差の差推定（Difference-in-Differences）の考え方を学ぶことができる。

【変数定義（SSDSE-B-2026列名）】
  SSDSE-B-2026 : 年度
  Prefecture   : 都道府県
  A1101 : 総人口
  A1303 : 65歳以上人口
  C3801 : 旅館営業施設数（ホテルを含む）
  G7101 : 延べ宿泊者数
  G7102 : 外国人延べ宿泊者数
  L3221 : 消費支出（二人以上の世帯）

【データ出典】
  SSDSE-B-2026.csv: 社会・人口統計体系（都道府県データ）
  統計数理研究所・統計センターより公表の実データ

【データサイエンス学習ポイント】
  1. 観光の乗数効果（経済波及効果の概念と統計的測定）
  2. COVID-19を自然実験として使う発想（差の差推定の基礎）
  3. 外国人比率の計算と解釈（インバウンド需要の定量化）
  4. 宿泊密度指標の設計（人口比による標準化の意味）
=================================================================
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2022_H5_5_shorei.py
# ============================================================


import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ──────────────────────────────────────────────────────────────
# 共通設定
# ──────────────────────────────────────────────────────────────
plt.rcParams['font.family'] = 'Hiragino Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 150

import os
FIG_DIR = 'html/figures'
DATA_B  = 'data/raw/SSDSE-B-2026.csv'
os.makedirs(FIG_DIR, exist_ok=True)

# ================================================================
# ■ 実データ読み込み（SSDSE-B-2026: 都道府県データ）
# ================================================================
df_raw = pd.read_csv(DATA_B, encoding='shift-jis', header=0)
# 行0がラベル行、行1以降が実データ
df_all = df_raw.iloc[1:].copy()
df_all.columns = df_raw.columns

# 数値列に変換
NUM_COLS = ['A1101', 'A1303', 'C3801', 'G7101', 'G7102', 'L3221']
for c in NUM_COLS:
    df_all[c] = pd.to_numeric(df_all[c], errors='coerce')

df_all['年度'] = df_all['SSDSE-B-2026'].astype(int)
df_all['都道府県'] = df_all['Prefecture']

print("=" * 65)
print("■ SSDSE-B-2026 読み込み完了")
print(f"  年度範囲: {df_all['年度'].min()}〜{df_all['年度'].max()}")
print(f"  都道府県数（各年）: {df_all.groupby('年度')['都道府県'].count().iloc[0]}件")
print("=" * 65)

# ================================================================
# ■ 特徴量エンジニアリング（全年度）
# ================================================================
df_all['宿泊密度']     = df_all['G7101'] / df_all['A1101'].clip(1)
df_all['外国人比率']   = df_all['G7102'] / df_all['G7101'].clip(1) * 100
df_all['消費支出_万円'] = df_all['L3221'] / 10000
df_all['旅館密度']     = df_all['C3801'] / df_all['A1101'].clip(1) * 10000
df_all['高齢化率']     = df_all['A1303'] / df_all['A1101'].clip(1) * 100

# 2022年データ抽出
df_2022 = df_all[df_all['年度'] == 2022].copy().reset_index(drop=True)
# 2019年データ抽出（コロナ前比較用）
df_2019 = df_all[df_all['年度'] == 2019].copy().reset_index(drop=True)

print(f"\n■ 2022年データ: {len(df_2022)}都道府県")
print(f"  宿泊密度（平均）: {df_2022['宿泊密度'].mean():.4f}")
print(f"  外国人比率（平均）: {df_2022['外国人比率'].mean():.2f}%")
print(f"  消費支出（平均）: {df_2022['消費支出_万円'].mean():.2f}万円/月")

# ================================================================
# ■ OLS回帰分析（2022年、消費支出の決定要因）
# ================================================================
print("\n" + "=" * 65)
print("■ OLS回帰分析（2022年）")
print("=" * 65)

REG_VARS = ['宿泊密度', '外国人比率', '旅館密度', '高齢化率']
REG_LABELS = ['宿泊密度', '外国人比率(%)', '旅館密度(万人対)', '高齢化率(%)']

df_reg = df_2022[['消費支出_万円'] + REG_VARS].dropna()
y_ols = df_reg['消費支出_万円'].values
X_ols = df_reg[REG_VARS].values

# 標準化（係数の比較のため）
X_std = (X_ols - X_ols.mean(axis=0)) / X_ols.std(axis=0)
X_with_const = sm.add_constant(X_std)
ols_model = sm.OLS(y_ols, X_with_const).fit()

print(ols_model.summary2())
print(f"\n  R² = {ols_model.rsquared:.4f}")
print(f"  自由度修正済R² = {ols_model.rsquared_adj:.4f}")

# 相関分析
print("\n■ 相関分析（宿泊密度 vs 消費支出_万円）")
r_corr, p_corr = stats.pearsonr(df_2022['宿泊密度'].dropna(),
                                  df_2022['消費支出_万円'].dropna())
print(f"  r = {r_corr:.4f}, p = {p_corr:.4f}")

# ================================================================
# ■ 図1: 延べ宿泊者数の時系列推移（地域別、COVID期の影響）
# ================================================================
print("\n" + "=" * 65)
print("■ 図1: 時系列推移（地域別）")
print("=" * 65)

# 地方区分
REGION_MAP = {
    '北海道': '北海道・東北',
    '青森県': '北海道・東北', '岩手県': '北海道・東北', '宮城県': '北海道・東北',
    '秋田県': '北海道・東北', '山形県': '北海道・東北', '福島県': '北海道・東北',
    '茨城県': '関東', '栃木県': '関東', '群馬県': '関東', '埼玉県': '関東',
    '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部', '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国',
    '徳島県': '中国・四国', '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄',
}
REGION_ORDER = ['北海道・東北', '関東', '中部', '近畿', '中国・四国', '九州・沖縄']
REGION_COLORS = {
    '北海道・東北': '#1565C0',
    '関東':         '#E65100',
    '中部':         '#2E7D32',
    '近畿':         '#6A1B9A',
    '中国・四国':   '#795548',
    '九州・沖縄':   '#00695C',
}

df_all['地方'] = df_all['都道府県'].map(REGION_MAP)
df_region_ts = (
    df_all.groupby(['年度', '地方'])['G7101']
    .sum()
    .reset_index()
)
df_region_ts['宿泊者数_億人'] = df_region_ts['G7101'] / 1e8

fig1, ax1 = plt.subplots(figsize=(11, 6))
for region in REGION_ORDER:
    sub = df_region_ts[df_region_ts['地方'] == region].sort_values('年度')
    if len(sub) == 0:
        continue
    ax1.plot(sub['年度'], sub['宿泊者数_億人'],
             marker='o', linewidth=2, markersize=5,
             color=REGION_COLORS[region], label=region)

# COVID期を強調
ax1.axvspan(2019.5, 2021.5, alpha=0.12, color='#C62828', zorder=0)
ax1.axvline(2019.5, color='#C62828', linestyle='--', linewidth=1.2, alpha=0.7)
ax1.axvline(2021.5, color='#C62828', linestyle='--', linewidth=1.2, alpha=0.7)
ax1.text(2020.4, ax1.get_ylim()[1] * 0.97, 'COVID-19\n急減期',
         ha='center', va='top', fontsize=9.5, color='#C62828', fontweight='bold')

ax1.set_xlabel('年度', fontsize=12)
ax1.set_ylabel('延べ宿泊者数（億人泊）', fontsize=12)
ax1.set_title('延べ宿泊者数の時系列推移（地方別）\n〜COVID-19による急減と回復〜',
              fontsize=13, fontweight='bold')
ax1.set_xticks(sorted(df_region_ts['年度'].unique()))
ax1.xaxis.set_tick_params(rotation=45)
ax1.legend(loc='upper left', fontsize=9.5, framealpha=0.85)
ax1.grid(True, alpha=0.3)
plt.tight_layout()
fig1.savefig(os.path.join(FIG_DIR, '2022_H5_5_fig1_timeseries.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig1)
print("図1保存: 2022_H5_5_fig1_timeseries.png")

# ================================================================
# ■ 図2: 宿泊密度 vs 消費支出 散布図（2022年、都道府県ラベル付き）
# ================================================================
print("\n■ 図2: 散布図（宿泊密度 vs 消費支出）")

fig2, ax2 = plt.subplots(figsize=(11, 7))

x2 = df_2022['宿泊密度'].values
y2 = df_2022['消費支出_万円'].values
pref2 = df_2022['都道府県'].values

# 散布図
sc2 = ax2.scatter(x2, y2, c=df_2022['外国人比率'].values,
                  cmap='YlOrRd', s=70, alpha=0.85,
                  edgecolors='#333', linewidth=0.5, zorder=3)
cbar2 = fig2.colorbar(sc2, ax=ax2, label='外国人比率（%）', shrink=0.8)
cbar2.ax.tick_params(labelsize=9)

# 回帰直線
valid_mask = np.isfinite(x2) & np.isfinite(y2)
z2 = np.polyfit(x2[valid_mask], y2[valid_mask], 1)
xs2 = np.linspace(x2[valid_mask].min(), x2[valid_mask].max(), 100)
ax2.plot(xs2, np.poly1d(z2)(xs2), 'b--', linewidth=1.8, alpha=0.7,
         label=f'回帰直線 (r={r_corr:.3f}, p={p_corr:.4f})')

# 都道府県ラベル（上位・下位・注目都市）
LABEL_PREFS = {
    '東京都', '北海道', '沖縄県', '京都府', '大阪府',
    '長野県', '山梨県', '石川県', '秋田県', '鳥取県',
    '神奈川県', '愛知県', '福岡県',
}
for i, (xi, yi, pref) in enumerate(zip(x2, y2, pref2)):
    if pref in LABEL_PREFS:
        short = pref.replace('県', '').replace('都', '').replace('道', '').replace('府', '')
        ax2.annotate(short, (xi, yi),
                     xytext=(4, 3), textcoords='offset points',
                     fontsize=7.5, color='#1A237E',
                     fontweight='bold')

ax2.set_xlabel('宿泊密度（延べ宿泊者数／総人口）', fontsize=12)
ax2.set_ylabel('消費支出（万円/月, 二人以上の世帯）', fontsize=12)
ax2.set_title('宿泊密度と消費支出の関係（2022年, 都道府県別）\n〜色：外国人延べ宿泊比率〜',
              fontsize=13, fontweight='bold')
ax2.legend(fontsize=9.5)
ax2.grid(True, alpha=0.3)
plt.tight_layout()
fig2.savefig(os.path.join(FIG_DIR, '2022_H5_5_fig2_scatter.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig2)
print("図2保存: 2022_H5_5_fig2_scatter.png")

# ================================================================
# ■ 図3: OLS回帰係数プロット（消費支出の決定要因）
# ================================================================
print("\n■ 図3: OLS回帰係数プロット")

coefs3 = np.asarray(ols_model.params[1:])
ses3   = np.asarray(ols_model.bse[1:])
pvals3 = np.asarray(ols_model.pvalues[1:])
ci_lo3 = coefs3 - 1.96 * ses3
ci_hi3 = coefs3 + 1.96 * ses3

COEF_COLORS = []
for p in pvals3:
    if p < 0.01:
        COEF_COLORS.append('#C62828')
    elif p < 0.05:
        COEF_COLORS.append('#FF8F00')
    elif p < 0.10:
        COEF_COLORS.append('#FDD835')
    else:
        COEF_COLORS.append('#9E9E9E')

fig3, ax3 = plt.subplots(figsize=(9, 5))
y_pos3 = np.arange(len(REG_LABELS))

ax3.barh(y_pos3, coefs3, color=COEF_COLORS, alpha=0.80,
         edgecolor='white', height=0.55)
ax3.errorbar(coefs3, y_pos3, xerr=1.96 * ses3,
             fmt='none', color='#222', capsize=5, linewidth=1.5)
ax3.axvline(0, color='gray', linestyle='--', linewidth=1.0)
ax3.set_yticks(y_pos3)
ax3.set_yticklabels(REG_LABELS, fontsize=11)
ax3.set_xlabel('標準化回帰係数（±1.96SE）', fontsize=11)
ax3.set_title(
    f'消費支出の決定要因 — OLS回帰係数（2022年, N={len(df_reg)}都道府県）\n'
    f'R²={ols_model.rsquared:.3f}（adj. R²={ols_model.rsquared_adj:.3f}）',
    fontsize=12, fontweight='bold'
)
ax3.invert_yaxis()
ax3.grid(axis='x', alpha=0.3)

# p値ラベル
for i, (c, p) in enumerate(zip(coefs3, pvals3)):
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'n.s.'
    offset = 0.008 if c >= 0 else -0.008
    ha = 'left' if c >= 0 else 'right'
    ax3.text(c + offset, i, f' {c:.3f} {sig}',
             va='center', ha=ha, fontsize=8.5)

from matplotlib.patches import Patch
ax3.legend(handles=[
    Patch(color='#C62828', alpha=0.8, label='p<0.01'),
    Patch(color='#FF8F00', alpha=0.8, label='p<0.05'),
    Patch(color='#FDD835', alpha=0.8, label='p<0.10'),
    Patch(color='#9E9E9E', alpha=0.8, label='n.s.'),
], fontsize=9, loc='lower right')
plt.tight_layout()
fig3.savefig(os.path.join(FIG_DIR, '2022_H5_5_fig3_coef.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig3)
print("図3保存: 2022_H5_5_fig3_coef.png")

# ================================================================
# ■ 図4: 外国人宿泊比率の都道府県別ランキング（2019年 vs 2022年）
# ================================================================
print("\n■ 図4: 外国人比率ランキング（2019 vs 2022）")

# 2019年・2022年の外国人比率
df_inbound = pd.merge(
    df_2019[['都道府県', '外国人比率']].rename(columns={'外国人比率': '外国人比率_2019'}),
    df_2022[['都道府県', '外国人比率']].rename(columns={'外国人比率': '外国人比率_2022'}),
    on='都道府県', how='inner'
).dropna()

# 2019年基準でソート（上位20都道府県）
df_inbound_sorted = df_inbound.sort_values('外国人比率_2019', ascending=True).tail(20)
prefs4 = df_inbound_sorted['都道府県'].str.replace('県', '').str.replace('都', '') \
                                       .str.replace('道', '').str.replace('府', '').values
v2019 = df_inbound_sorted['外国人比率_2019'].values
v2022 = df_inbound_sorted['外国人比率_2022'].values

fig4, ax4 = plt.subplots(figsize=(10, 8))
y_pos4 = np.arange(len(prefs4))
bar_h = 0.38

bars_2019 = ax4.barh(y_pos4 + bar_h / 2, v2019, height=bar_h,
                      color='#1565C0', alpha=0.80, label='2019年（コロナ前）',
                      edgecolor='white')
bars_2022 = ax4.barh(y_pos4 - bar_h / 2, v2022, height=bar_h,
                      color='#E65100', alpha=0.80, label='2022年（コロナ後回復期）',
                      edgecolor='white')

# 数値ラベル
for bar, val in zip(bars_2019, v2019):
    ax4.text(val + 0.2, bar.get_y() + bar.get_height() / 2,
             f'{val:.1f}%', va='center', ha='left', fontsize=7.5)
for bar, val in zip(bars_2022, v2022):
    ax4.text(val + 0.2, bar.get_y() + bar.get_height() / 2,
             f'{val:.1f}%', va='center', ha='left', fontsize=7.5)

ax4.set_yticks(y_pos4)
ax4.set_yticklabels(prefs4, fontsize=9.5)
ax4.set_xlabel('外国人延べ宿泊比率（%）', fontsize=11)
ax4.set_title('外国人宿泊比率ランキング — 2019年 vs 2022年\n〜上位20都道府県、2019年基準〜',
              fontsize=13, fontweight='bold')
ax4.legend(fontsize=10)
ax4.grid(axis='x', alpha=0.3)
plt.tight_layout()
fig4.savefig(os.path.join(FIG_DIR, '2022_H5_5_fig4_inbound.png'),
             bbox_inches='tight', dpi=150)
plt.close(fig4)
print("図4保存: 2022_H5_5_fig4_inbound.png")

# ================================================================
# ■ 完了メッセージ
# ================================================================
print("\n" + "=" * 65)
print("■ 全図生成完了（4枚）")
print(f"  fig1_timeseries.png : 延べ宿泊者数の時系列推移（地方別）")
print(f"  fig2_scatter.png    : 宿泊密度 vs 消費支出 散布図（2022年）")
print(f"  fig3_coef.png       : OLS回帰係数プロット")
print(f"  fig4_inbound.png    : 外国人比率ランキング（2019 vs 2022）")
print("=" * 65)