"""
2023年度 統計データ分析コンペ 統計数理賞（大学生）
論文：CO2排出特性と地域特性の関係―2050年カーボンニュートラルの実現に向けて―

手法：相関分析・地域分類・散布図・LQ（立地係数）スタイル分析
データ：SSDSE-B-2026.csv（都道府県別統計データ）

CO2プロキシ変数:
  - L322103 / L3221: 光熱・水道費割合（エネルギー消費強度の代理）
  - L322107 / L3221: 交通・通信費割合（交通エネルギー消費の代理）
  - H5610: 1人1日ごみ排出量（産業活動量の代理）
"""

# ============================================================
# 【データの準備】実行前に以下のデータファイルを用意してください
#
#   必要ファイル:
#     ・SSDSE-B-2026.csv
#       → data/raw/SSDSE-B-2026.csv に配置
#
#   ダウンロード先:
#     https://www.nstac.go.jp/use/literacy/ssdse/
#     （SSDSE-B（社会・人口統計体系 都道府県データ） の CSV をダウンロード）
#
#   フォルダ配置（プロジェクトルートからの相対パス）:
#     code/                ← このスクリプトの場所
#     data/raw/            ← CSV ファイルをここに配置
#     html/figures/        ← 図の出力先（自動生成）
#
#   実行方法（ファイルを一切編集せず実行可能）:
#     python3 code/2023_U3_suri.py
# ============================================================


import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import rcParams
from scipy import stats

warnings.filterwarnings('ignore')

# ---------- パス設定 ----------
_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in dir() else os.getcwd()
FIG_DIR = os.path.join(_dir, '..', 'html', 'figures')
DATA_B  = os.path.join(_dir, '..', 'data', 'raw', 'SSDSE-B-2026.csv')

os.makedirs(FIG_DIR, exist_ok=True)

# ---------- フォント設定 ----------
rcParams['font.family'] = ['Hiragino Sans', 'Hiragino Kaku Gothic ProN',
                            'Noto Sans CJK JP', 'DejaVu Sans']
rcParams['axes.unicode_minus'] = False

# ---------- データ読み込み ----------
df_raw = pd.read_csv(DATA_B, header=0, encoding='cp932')

# 都道府県コード（R+5桁）の行のみ抽出
mask = df_raw['Code'].astype(str).str.match(r'^R\d{5}$')
df_all = df_raw[mask].copy()

# 2022年度データ（47都道府県）
df = df_all[df_all['SSDSE-B-2026'].astype(str) == '2022'].copy()
df = df.reset_index(drop=True)

# ---------- 数値変換 ----------
num_cols = ['L3221', 'L322103', 'L322107', 'H5610',
            'B4101', 'B4109', 'A1303', 'A1101',
            'F3103', 'F3102']
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# ---------- 分析変数の作成 ----------
# 光熱・水道費割合（%）
df['energy_ratio'] = df['L322103'] / df['L3221'] * 100

# 交通・通信費割合（%）
df['transport_ratio'] = df['L322107'] / df['L3221'] * 100

# CO2排出強度プロキシ = (光熱費割合 + 交通費割合) × 消費支出総額 / 100
df['co2_proxy'] = (df['energy_ratio'] + df['transport_ratio']) * df['L3221'] / 100

# 1人1日ごみ排出量（g/人/日）
df['waste'] = df['H5610']

# 有効求人倍率 = 有効求職者数 / 有効求人数
df['job_ratio'] = df['F3103'] / df['F3102']

# 高齢化率（%）= 65歳以上人口 / 総人口
df['aging_rate'] = df['A1303'] / df['A1101'] * 100

# 気温（年平均, ℃）
df['temp'] = df['B4101']

# 降水量（年間, mm）
df['precip'] = df['B4109']

# ---------- LQ（立地係数）の計算 ----------
# エネルギーLQ：都道府県の光熱費割合 / 全国平均光熱費割合
national_energy_ratio    = df['energy_ratio'].mean()
national_transport_ratio = df['transport_ratio'].mean()
national_waste           = df['waste'].mean()

df['energy_lq']    = df['energy_ratio']    / national_energy_ratio
df['transport_lq'] = df['transport_ratio'] / national_transport_ratio
df['waste_lq']     = df['waste']           / national_waste

# ---------- 地域分類 ----------
region_map = {
    '北海道': '北海道・東北', '青森県': '北海道・東北', '岩手県': '北海道・東北',
    '宮城県': '北海道・東北', '秋田県': '北海道・東北', '山形県': '北海道・東北',
    '福島県': '北海道・東北',
    '茨城県': '関東', '栃木県': '関東', '群馬県': '関東',
    '埼玉県': '関東', '千葉県': '関東', '東京都': '関東', '神奈川県': '関東',
    '新潟県': '中部', '富山県': '中部', '石川県': '中部', '福井県': '中部',
    '山梨県': '中部', '長野県': '中部', '岐阜県': '中部', '静岡県': '中部',
    '愛知県': '中部',
    '三重県': '近畿', '滋賀県': '近畿', '京都府': '近畿', '大阪府': '近畿',
    '兵庫県': '近畿', '奈良県': '近畿', '和歌山県': '近畿',
    '鳥取県': '中国・四国', '島根県': '中国・四国', '岡山県': '中国・四国',
    '広島県': '中国・四国', '山口県': '中国・四国', '徳島県': '中国・四国',
    '香川県': '中国・四国', '愛媛県': '中国・四国', '高知県': '中国・四国',
    '福岡県': '九州・沖縄', '佐賀県': '九州・沖縄', '長崎県': '九州・沖縄',
    '熊本県': '九州・沖縄', '大分県': '九州・沖縄', '宮崎県': '九州・沖縄',
    '鹿児島県': '九州・沖縄', '沖縄県': '九州・沖縄',
}
df['region'] = df['Prefecture'].map(region_map).fillna('その他')

# LQによる高中低分類（エネルギー強度）
df['lq_class'] = pd.cut(
    df['energy_lq'],
    bins=[0, 0.9, 1.1, 99],
    labels=['低エネルギー強度\n(LQ<0.9)', '中エネルギー強度\n(0.9≤LQ<1.1)', '高エネルギー強度\n(LQ≥1.1)']
)

# ---------- 地域カラー設定 ----------
region_colors = {
    '北海道・東北': '#2196F3',
    '関東':         '#F44336',
    '中部':         '#FF9800',
    '近畿':         '#9C27B0',
    '中国・四国':   '#009688',
    '九州・沖縄':   '#795548',
}


# ============================================================
# 図1: 相関行列ヒートマップ
# ============================================================
fig1_vars = {
    'CO2プロキシ\n(エネルギー×支出)': df['co2_proxy'],
    'エネルギー費\n割合(%)':           df['energy_ratio'],
    '交通費\n割合(%)':                  df['transport_ratio'],
    'ごみ排出量\n(g/人/日)':            df['waste'],
    '気温(℃)':                         df['temp'],
    '降水量(mm)':                       df['precip'],
    '高齢化率(%)':                      df['aging_rate'],
    '有効求人\n倍率':                   df['job_ratio'],
}

fig1_df = pd.DataFrame(fig1_vars)
corr_matrix = fig1_df.corr()

fig, ax = plt.subplots(figsize=(9, 7.5))
n = len(corr_matrix)
im = ax.imshow(corr_matrix.values, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label='相関係数')

ax.set_xticks(range(n))
ax.set_yticks(range(n))
labels = list(fig1_vars.keys())
ax.set_xticklabels(labels, fontsize=9, rotation=0)
ax.set_yticklabels(labels, fontsize=9)

for i in range(n):
    for j in range(n):
        val = corr_matrix.values[i, j]
        color = 'white' if abs(val) > 0.5 else 'black'
        ax.text(j, i, f'{val:.2f}', ha='center', va='center', fontsize=8, color=color, fontweight='bold')

ax.set_title('図1：CO2プロキシ・気候・人口指標の相関行列\n（2022年 47都道府県 SSDSE-B）',
             fontsize=12, pad=14, fontweight='bold')
plt.tight_layout()
fig_path = os.path.join(FIG_DIR, '2023_U3_fig1_corr.png')
plt.savefig(fig_path, dpi=150, bbox_inches='tight')
plt.close()
print(f'図1 保存: {fig_path}')


# ============================================================
# 図2: LQ散布図（エネルギーLQ vs ごみLQ, 地域別色分け）
# ============================================================
fig, ax = plt.subplots(figsize=(9, 6.5))

for region, grp in df.groupby('region'):
    color = region_colors.get(region, '#607D8B')
    ax.scatter(grp['energy_lq'], grp['waste_lq'],
               color=color, label=region, s=70, alpha=0.85,
               edgecolors='white', linewidths=0.5, zorder=3)
    for _, row in grp.iterrows():
        pref = row['Prefecture'].replace('県', '').replace('府', '').replace('都', '').replace('道', '')
        ax.annotate(pref, (row['energy_lq'], row['waste_lq']),
                    fontsize=7, xytext=(3, 2), textcoords='offset points', alpha=0.85)

ax.axhline(1.0, color='gray', linewidth=0.8, linestyle='--', alpha=0.6)
ax.axvline(1.0, color='gray', linewidth=0.8, linestyle='--', alpha=0.6)
ax.fill_between([1.0, ax.get_xlim()[1] if ax.get_xlim()[1] > 1 else 2],
                1.0, 2.0, alpha=0.04, color='red')

ax.set_xlabel('エネルギー費LQ（光熱・水道費特化係数）', fontsize=11)
ax.set_ylabel('ごみ排出LQ（1人1日ごみ排出量特化係数）', fontsize=11)
ax.set_title('図2：LQ散布図 ― エネルギー特化係数 vs ごみ排出特化係数\n（2022年 47都道府県 / 地域別色分け）',
             fontsize=12, pad=12, fontweight='bold')

legend = ax.legend(title='地域', fontsize=9, title_fontsize=9,
                   loc='upper right', framealpha=0.9)
ax.text(1.02, 1.02, '両LQ>1\n(高エネルギー+高ごみ)',
        fontsize=8, color='red', alpha=0.7, transform=ax.transAxes,
        ha='right', va='bottom')

ax.grid(True, alpha=0.3, linewidth=0.5)
plt.tight_layout()
fig_path = os.path.join(FIG_DIR, '2023_U3_fig2_lq.png')
plt.savefig(fig_path, dpi=150, bbox_inches='tight')
plt.close()
print(f'図2 保存: {fig_path}')


# ============================================================
# 図3: CO2プロキシ 上位10・下位10 棒グラフ
# ============================================================
df_sorted = df[['Prefecture', 'co2_proxy', 'region']].sort_values('co2_proxy', ascending=False).reset_index(drop=True)
pref_label = df_sorted['Prefecture'].str.replace('県', '').str.replace('府', '').str.replace('都', '').str.replace('道', '')

top10    = df_sorted.head(10)
bottom10 = df_sorted.tail(10)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# --- 上位10 ---
ax = axes[0]
colors_top = [region_colors.get(r, '#607D8B') for r in top10['region']]
bars = ax.barh(range(10), top10['co2_proxy'].values / 1000,
               color=colors_top, edgecolor='white', linewidth=0.5)
ax.set_yticks(range(10))
labels_top = top10['Prefecture'].str.replace('県','').str.replace('府','').str.replace('都','').str.replace('道','').tolist()
ax.set_yticklabels(labels_top, fontsize=10)
ax.invert_yaxis()
ax.set_xlabel('CO2排出強度プロキシ（千円/世帯）', fontsize=10)
ax.set_title('CO2プロキシ 上位10都道府県', fontsize=11, fontweight='bold')
ax.grid(axis='x', alpha=0.3, linewidth=0.5)
for i, (bar, val) in enumerate(zip(bars, top10['co2_proxy'].values)):
    ax.text(bar.get_width() + 0.3, i, f'{val/1000:.1f}',
            va='center', fontsize=8, color='#333')

# --- 下位10 ---
ax = axes[1]
bottom10_sorted = bottom10.sort_values('co2_proxy', ascending=True)
colors_bot = [region_colors.get(r, '#607D8B') for r in bottom10_sorted['region']]
bars = ax.barh(range(10), bottom10_sorted['co2_proxy'].values / 1000,
               color=colors_bot, edgecolor='white', linewidth=0.5)
ax.set_yticks(range(10))
labels_bot = bottom10_sorted['Prefecture'].str.replace('県','').str.replace('府','').str.replace('都','').str.replace('道','').tolist()
ax.set_yticklabels(labels_bot, fontsize=10)
ax.set_xlabel('CO2排出強度プロキシ（千円/世帯）', fontsize=10)
ax.set_title('CO2プロキシ 下位10都道府県', fontsize=11, fontweight='bold')
ax.grid(axis='x', alpha=0.3, linewidth=0.5)
for i, (bar, val) in enumerate(zip(bars, bottom10_sorted['co2_proxy'].values)):
    ax.text(bar.get_width() + 0.3, i, f'{val/1000:.1f}',
            va='center', fontsize=8, color='#333')

# 地域凡例
handles = [mpatches.Patch(color=c, label=r) for r, c in region_colors.items()]
fig.legend(handles=handles, title='地域', fontsize=9, title_fontsize=9,
           loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05), framealpha=0.9)

fig.suptitle('図3：CO2排出強度プロキシによる都道府県ランキング（2022年）',
             fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()
fig_path = os.path.join(FIG_DIR, '2023_U3_fig3_rank.png')
plt.savefig(fig_path, dpi=150, bbox_inches='tight')
plt.close()
print(f'図3 保存: {fig_path}')


# ============================================================
# 図4: 気温 vs CO2プロキシ 散布図（回帰直線付き）
# ============================================================
x_val = df['temp'].values
y_val = df['co2_proxy'].values / 1000  # 千円単位

# 回帰直線
slope, intercept, r_val, p_val, se = stats.linregress(x_val, y_val)
x_line = np.linspace(x_val.min() - 0.5, x_val.max() + 0.5, 100)
y_line = slope * x_line + intercept

fig, ax = plt.subplots(figsize=(9, 6))

for region, grp in df.groupby('region'):
    color = region_colors.get(region, '#607D8B')
    gx = pd.to_numeric(grp['temp'], errors='coerce').values
    gy = grp['co2_proxy'].values / 1000
    ax.scatter(gx, gy, color=color, label=region, s=70, alpha=0.85,
               edgecolors='white', linewidths=0.5, zorder=3)
    for _, row in grp.iterrows():
        pref = row['Prefecture'].replace('県', '').replace('府', '').replace('都', '').replace('道', '')
        ax.annotate(pref, (row['temp'], row['co2_proxy'] / 1000),
                    fontsize=7, xytext=(3, 2), textcoords='offset points', alpha=0.85)

ax.plot(x_line, y_line, color='#D32F2F', linewidth=1.8, linestyle='--',
        label=f'回帰直線 (r={r_val:.3f}, p={p_val:.3f})', zorder=4)

sig_str = '（統計的に有意）' if p_val < 0.05 else '（非有意）'
ax.set_xlabel('年平均気温（℃）', fontsize=11)
ax.set_ylabel('CO2排出強度プロキシ（千円/世帯）', fontsize=11)
ax.set_title(f'図4：年平均気温 vs CO2排出強度プロキシ\nr = {r_val:.3f}, p = {p_val:.4f} {sig_str}',
             fontsize=12, pad=12, fontweight='bold')

ax.legend(fontsize=9, loc='upper right', framealpha=0.9)
ax.grid(True, alpha=0.3, linewidth=0.5)
plt.tight_layout()
fig_path = os.path.join(FIG_DIR, '2023_U3_fig4_temp.png')
plt.savefig(fig_path, dpi=150, bbox_inches='tight')
plt.close()
print(f'図4 保存: {fig_path}')


# ============================================================
# 補足：相関係数の表示
# ============================================================
print('\n===== 相関分析結果（CO2プロキシ vs 各指標） =====')
targets = {
    '気温(B4101)':   df['temp'],
    '降水量(B4109)': df['precip'],
    '高齢化率':      df['aging_rate'],
    '有効求人倍率':  df['job_ratio'],
    'ごみ排出量':    df['waste'],
}
for name, series in targets.items():
    x_ = df['co2_proxy'].values
    y_ = series.values
    mask2 = ~(np.isnan(x_) | np.isnan(y_))
    if mask2.sum() > 5:
        r_, p_ = stats.pearsonr(x_[mask2], y_[mask2])
        sig = '*' if p_ < 0.05 else ''
        print(f'  {name:15s}: r={r_:+.3f}, p={p_:.4f} {sig}')

print('\n===== LQ分類（エネルギー強度） =====')
print(df.groupby('lq_class', observed=True)['Prefecture'].apply(
    lambda x: ', '.join(x.str.replace('県','').str.replace('府','').str.replace('都','').str.replace('道',''))
).to_string())

print('\n===== CO2プロキシ 上位5 / 下位5 =====')
print('上位5:', df.nlargest(5, 'co2_proxy')[['Prefecture', 'co2_proxy']].to_string(index=False))
print('下位5:', df.nsmallest(5, 'co2_proxy')[['Prefecture', 'co2_proxy']].to_string(index=False))

print('\n全図の生成が完了しました。')