論文中に「ステップワイズ法」として登場する用語。
ステップワイズ法とは:AIC等を基準に、説明変数を1つずつ追加・削除して最適なモデルを探索する自動変数選択法。
import pandas as pd
import statsmodels.api as sm

## Load the table (row index 1 is skipped — presumably a second header row;
## confirm against the file), keep rows whose '年度' column equals 2023, and
## drop every row that contains a missing value.
df = pd.read_csv('data/raw/SSDSE-B-2026.csv', encoding='cp932', skiprows=[1])
df = df[df['年度'] == 2023].dropna()
y = df['D1101']
candidate_cols = ['A1101', 'A4101', 'F2401', 'E1501', 'I5101']


def forward_selection(X_pool, y, threshold=2.0):
    """Greedy forward variable selection for OLS driven by AIC.

    Starting from an intercept-only model, each round fits one OLS model per
    remaining candidate column (intercept + already selected columns + the
    candidate) and picks the candidate with the lowest AIC. The candidate is
    accepted only if it lowers the current AIC by at least ``threshold``;
    otherwise the search stops.

    Args:
        X_pool: DataFrame whose columns are the candidate explanatory
            variables.
        y: Response variable, aligned with ``X_pool`` by index.
        threshold: Minimum AIC improvement required to add a variable.

    Returns:
        List of selected column names, in the order they were added.
    """
    selected = []
    # Baseline: intercept-only model (add_constant on a column-less frame).
    current_aic = sm.OLS(y, sm.add_constant(pd.DataFrame(index=y.index))).fit().aic
    while True:
        candidates = [c for c in X_pool.columns if c not in selected]
        if not candidates:
            break
        aics = {}
        for c in candidates:
            X = sm.add_constant(X_pool[selected + [c]])
            aics[c] = sm.OLS(y, X).fit().aic
        best_var = min(aics, key=aics.get)
        # Stop once the best remaining candidate no longer improves AIC enough.
        if current_aic - aics[best_var] < threshold:
            break
        selected.append(best_var)
        current_aic = aics[best_var]
        print(f'Add {best_var}, AIC={current_aic:.2f}')
    return selected


selected = forward_selection(df[candidate_cols], y)
print(f'Selected: {selected}')
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

## Sequential feature selection with mlxtend around a linear regression,
## scored by R^2 with cv=5.
X = df[candidate_cols].values
sfs = SFS(
    LinearRegression(),
    k_features='best',
    forward=True,    ## True = forward, False = backward
    floating=True,   ## bidirectional (floating selection)
    scoring='r2',
    cv=5,
)
sfs.fit(X, y.values)

## k_feature_idx_ holds column positions; map them back to column names.
chosen_names = [candidate_cols[i] for i in sfs.k_feature_idx_]
print(f'選ばれた特徴量: {chosen_names}')
print(f'CV R²: {sfs.k_score_:.3f}')
from sklearn.feature_selection import SequentialFeatureSelector, RFE, RFECV

## Cross-validation-based sequential (forward) selection
sfs = SequentialFeatureSelector(
    LinearRegression(),
    direction='forward',
    n_features_to_select='auto',
    tol=0.001,
    cv=5,
)
sfs.fit(X, y)
support_mask = sfs.get_support()
print(f'選択: {support_mask}')

## RFE — recursive feature elimination (a generalisation of backward
## selection); the cross-validated variant RFECV is used here.
rfe = RFECV(LinearRegression(), scoring='r2', cv=5, step=1)
rfe.fit(X, y)
print(f'最適変数数: {rfe.n_features_}')
print(f'ランキング: {rfe.ranking_}')
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

## Lasso requires standardisation (the coefficients are meaningless unless
## the features share a common scale).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_scaled, y)
print(f'最適 α: {lasso.alpha_:.4f}')

## Columns whose fitted coefficient is not (numerically) zero.
nonzero_cols = [c for c, v in zip(candidate_cols, lasso.coef_) if abs(v) > 1e-6]
print(f'非ゼロ係数の変数: {nonzero_cols}')

rounded_coefs = dict(zip(candidate_cols, lasso.coef_.round(3)))
print(f'係数: {rounded_coefs}')