Chapter 8. Feature Selection
This chapter covers techniques for selecting features.
import numpy as np

# Flag columns whose pairwise correlation with another column exceeds
# the threshold. agg_df is assumed to be a numeric DataFrame defined
# in an earlier section.
threshold = 0.95
corr = agg_df.corr()
# Keep only the upper triangle (k=1 also drops the diagonal) so each
# pair is examined once.
mask = np.triu(
    np.ones(corr.shape), k=1
).astype(bool)
corr_no_diag = corr.where(mask)
coll = [
    c
    for c in corr_no_diag.columns
    if any(abs(corr_no_diag[c]) > threshold)
]
coll
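With the collinear column names collected in coll, a minimal follow-up sketch (the drop itself is an assumption, not shown above) removes them before modeling:

# Drop the flagged columns; returns a new DataFrame.
agg_df.drop(columns=coll)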
import pandas as pd
from sklearn import preprocessing
from sklearn.experimental import (
    enable_iterative_imputer,
)  # registers IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
orig_df = df
def tweak_titanic(df):
    # Drop leaky and high-cardinality columns, then one-hot encode the
    # remaining categorical columns.
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    # Split features from the target, impute missing numeric values,
    # and optionally standardize columns (fitting on the training set
    # only, to avoid leaking test information).
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=42
    )
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    fi = IterativeImputer()
    X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])
    if std_cols:
        std = preprocessing.StandardScaler()
        X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
        X_test.loc[:, std_cols] = std.transform(X_test[std_cols])
    return X_train, X_test, y_train, y_test
ti_df = tweak_titanic(orig_df)
X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, "survived")
# Recombine the splits for the selectors below that operate on all rows.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
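The std_cols hook is not exercised above; a minimal sketch of how it could be used (the column choice is an illustrative assumption):

# Standardize the continuous columns while splitting.
X_train_s, X_test_s, y_train_s, y_test_s = get_train_test_X_y(
    ti_df, "survived", std_cols=["age", "fare"]
)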
import matplotlib.pyplot as plt
import rfpimp

# Feature dependence: how well each feature can be predicted from the
# others. Values near 1 suggest the column is redundant.
rfpimp.plot_dependence_heatmap(
    rfpimp.feature_dependence_matrix(X_train),
    value_fontsize=12,
    label_fontsize=14,
    figsize=(8, 8),
)
fig = plt.gcf()
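The underlying matrix can also be inspected directly; a sketch assuming, as in recent rfpimp releases, that the returned DataFrame includes a Dependence column:

dep = rfpimp.feature_dependence_matrix(X_train)
# Per-feature dependence score, most redundant first (column name assumed).
dep["Dependence"].sort_values(ascending=False)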
from sklearn.ensemble import RandomForestClassifier

# Columns the dependence heatmap flagged as largely predictable from
# the other features.
cols_to_remove = [
    "pclass",
    "sibsp",
    "parch",
    "embarked_Q",
]
keep = [
    c
    for c in X_train.columns
    if c not in cols_to_remove
]
rf3 = RandomForestClassifier(random_state=42)
rf3.fit(X_train[keep], y_train)
rf3.score(X_test[keep], y_test)
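For context, the reduced model can be compared against a baseline trained on every column; this comparison is a sketch not present in the original:

# Baseline random forest on the full feature set.
rf_all = RandomForestClassifier(random_state=42)
rf_all.fit(X_train, y_train)
rf_all.score(X_test, y_test)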
from sklearn import linear_model

# Fit the lasso regularization path; coefficients that shrink to zero
# early are candidates for removal.
model = linear_model.LassoLarsCV(
    cv=10, max_n_alphas=10
).fit(X_train, y_train)
fig, ax = plt.subplots(figsize=(12, 8))
cm = iter(
    plt.get_cmap("tab20")(
        np.linspace(0, 1, X.shape[1])
    )
)
for i in range(X.shape[1]):
    c = next(cm)
    ax.plot(
        model.alphas_,
        model.coef_path_.T[:, i],
        c=c,
        alpha=0.8,
        label=X.columns[i],
    )
ax.axvline(
    model.alpha_,
    linestyle="-",
    c="k",
    label="alphaCV",
)
ax.set_ylabel("Regression Coefficients")
# Labels were set on each line above; a plain legend call avoids
# mislabeling the alphaCV line with a column name.
ax.legend(bbox_to_anchor=(1, 1))
ax.set_xlabel("alpha")
ax.set_title(
    "Regression Coefficients Progression for Lasso Paths"
)
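To turn the path plot into an explicit selection, one option (a sketch, not shown in the original) keeps the columns whose coefficients are nonzero at the cross-validated alpha:

coef = pd.Series(model.coef_, index=X_train.columns)
coef[coef != 0].index  # columns the lasso retained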
from sklearn import ensemble
from yellowbrick.model_selection import (
    RFECV,
)  # lived in yellowbrick.features in older releases

fig, ax = plt.subplots(figsize=(6, 4))
rfe = RFECV(
    ensemble.RandomForestClassifier(
        n_estimators=100
    ),
    cv=5,
)
rfe.fit(X, y)
rfe.rfe_estimator_.ranking_
rfe.rfe_estimator_.n_features_
rfe.rfe_estimator_.support_
rfe.show()  # formerly rfe.poof()
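The boolean support mask maps back to column names; a small sketch not in the original:

# Columns the cross-validated elimination kept.
X.columns[rfe.rfe_estimator_.support_]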
from sklearn.feature_selection import RFE

model = ensemble.RandomForestClassifier(
    n_estimators=100
)
# Recursively drop the weakest feature until four remain.
rfe = RFE(model, n_features_to_select=4)
rfe.fit(X, y)
X.columns[rfe.support_]
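To see how every column fared (rank 1 means kept), a sketch not in the original:

pd.Series(rfe.ranking_, index=X.columns).sort_values()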
from sklearn import feature_selection

# Mutual information is nonnegative and captures nonlinear
# relationships between each feature and the target.
mic = feature_selection.mutual_info_classif(X, y)
fig, ax = plt.subplots(figsize=(10, 8))
(
    pd.DataFrame(
        {"feature": X.columns, "vimp": mic}
    )
    .set_index("feature")
    .plot.barh(ax=ax)
)
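To act on the scores rather than just plot them, one option (a sketch using scikit-learn's SelectKBest; k=5 is an illustrative assumption) is:

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Keep the five columns with the highest mutual information.
skb = SelectKBest(mutual_info_classif, k=5)
skb.fit(X, y)
X.columns[skb.get_support()]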