8.1 Collinear Columns

import numpy as np

threshold = 0.95

corr = agg_df.corr()
# Keep only the upper triangle (k=1 drops the diagonal) so each
# pair of columns is compared exactly once.
mask = np.triu(
  np.ones(corr.shape), k=1
).astype(bool)

corr_no_diag = corr.where(mask)
# Flag any column whose absolute correlation with another column
# exceeds the threshold.
coll = [
  c
  for c in corr_no_diag.columns
  if any(abs(corr_no_diag[c]) > threshold)
]

coll
['pclass_min',
 'pclass_max',
 'pclass_mean',
 'sibsp_mean',
 'parch_mean',
 'fare_mean',
 'body_min',
 'body_max',
 'body_mean',
 'body_sum']
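
Because only the upper triangle was kept, each flagged column is one that duplicates a column appearing earlier in the frame, so dropping the whole list still keeps one representative of each correlated group. A minimal sketch (not from the original text), assuming agg_df is the aggregated frame built earlier in the chapter:

# Drop the flagged columns; one member of each correlated
# group survives because only later duplicates were flagged.
deduped_df = agg_df.drop(columns=coll)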
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble, preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
orig_df = df

def tweak_titanic(df):
  df = df.drop(
    columns=[
      "name",
      "ticket",
      "home.dest",
      "boat",
      "body",
      "cabin",
    ]
  ).pipe(pd.get_dummies, drop_first=True)
  return df

def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
  y = df[y_col]
  X = df.drop(columns=y_col)
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=size, random_state=42
  )
  num_cols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
  ]

  # Fit the imputer on the training fold only, then reuse it on the
  # test fold to avoid leaking test information.
  fi = IterativeImputer()

  X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
  X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])

  if std_cols:
    std = preprocessing.StandardScaler()
    X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
    X_test.loc[:, std_cols] = std.transform(X_test[std_cols])

  return X_train, X_test, y_train, y_test  

ti_df = tweak_titanic(orig_df)
X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, "survived")  

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
import rfpimp

# Each cell estimates how well one feature can be predicted from the
# others; high values point at redundant columns.
rfpimp.plot_dependence_heatmap(
  rfpimp.feature_dependence_matrix(X_train),
  value_fontsize=12,
  label_fontsize=14,
  figsize=(8, 8),
)

fig = plt.gcf()
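
The dependence matrix is model based: rfpimp fits a model to predict each feature from the others, so it can pick up nonlinear redundancy that plain correlation misses. If rfpimp is not installed, a rougher substitute is an ordinary rank-correlation heatmap; a minimal sketch using only matplotlib and pandas:

# Spearman correlation heatmap as a rough stand-in for the
# model-based dependence matrix above.
fig, ax = plt.subplots(figsize=(8, 8))
im = ax.imshow(X_train.corr(method="spearman"), cmap="viridis")
ax.set_xticks(range(len(X_train.columns)))
ax.set_xticklabels(X_train.columns, rotation=90)
ax.set_yticks(range(len(X_train.columns)))
ax.set_yticklabels(X_train.columns)
fig.colorbar(im)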
from sklearn.ensemble import (
  RandomForestClassifier,
)

# Columns the dependence heatmap flagged as largely predictable from
# the remaining features.
cols_to_remove = [
  "pclass",
  "sibsp",
  "parch",
  "embarked_Q",
]

rf3 = RandomForestClassifier(random_state=42)
rf3.fit(
  X_train[
    [
      c
      for c in X_train.columns
      if c not in cols_to_remove
    ]
  ],
  y_train,
)

rf3.score(
  X_test[
    [
      c
      for c in X_train.columns
      if c not in cols_to_remove
    ]
  ],
  y_test,
)
0.7608142493638677
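
To judge what removing those columns cost (or bought), it helps to compare against the same model trained on every column. A small sketch under the same split; the rf_all name is ours, not from the original:

# Baseline: identical model, all columns kept.
rf_all = RandomForestClassifier(random_state=42)
rf_all.fit(X_train, y_train)
rf_all.score(X_test, y_test)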

8.2 Lasso Regression

from sklearn import linear_model

# LARS-based lasso with the regularization strength picked by 10-fold CV.
model = linear_model.LassoLarsCV(
  cv=10, max_n_alphas=10
).fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(12, 8))
# One color per feature, sampled from the tab20 colormap.
cm = iter(
  plt.get_cmap("tab20")(
    np.linspace(0, 1, X.shape[1])
  )
)

for i in range(X.shape[1]):
  c = next(cm)
  ax.plot(
    model.alphas_,
    model.coef_path_.T[:, i],
    c=c,
    alpha=0.8,
    label=X.columns[i],
  )
    
ax.axvline(
  model.alpha_,
  linestyle="-",
  c="k",
  label="alphaCV",
)

ax.set_ylabel("Regression Coefficients")
ax.legend(X.columns, bbox_to_anchor=(1, 1))
ax.set_xlabel("alpha")
ax.set_title(
  "Regression Coefficients Progression for Lasso Paths"
)
Text(0.5,1,'Regression Coefficients Progression for Lasso Paths')
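
The plot shows coefficients shrinking toward zero as alpha grows; the features still nonzero at the CV-chosen alpha are the ones the lasso keeps. A short sketch (ours, not from the original) to list them:

# Coefficients at the alpha selected by cross-validation; the
# zeroed-out entries are the features the lasso discarded.
coefs = pd.Series(model.coef_, index=X_train.columns)
coefs[coefs != 0].sort_values()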

8.3 Recursive Feature Elimination

from yellowbrick.features import RFECV
fig, ax = plt.subplots(figsize=(6, 4))
# Recursively drop the weakest features; 5-fold CV picks how many
# features to keep.
rfe = RFECV(
  ensemble.RandomForestClassifier(
    n_estimators=100
  ),
  cv=5,
)

rfe.fit(X, y)
rfe.rfe_estimator_.ranking_
array([1, 1, 2, 3, 1, 1, 5, 4])
rfe.rfe_estimator_.n_features_
4
rfe.rfe_estimator_.support_
array([ True,  True, False, False,  True,  True, False, False])
rfe.poof()
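
Yellowbrick's visualizer is modeled on scikit-learn's own RFECV; if you only need the numbers rather than the plot, the scikit-learn version can be used directly. A minimal sketch, assuming the same estimator settings:

from sklearn.feature_selection import RFECV as SkRFECV

sk_rfe = SkRFECV(
  ensemble.RandomForestClassifier(n_estimators=100),
  cv=5,
)
sk_rfe.fit(X, y)
sk_rfe.n_features_          # CV-chosen feature count
X.columns[sk_rfe.support_]  # the surviving columns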
from sklearn.feature_selection import RFE
model = ensemble.RandomForestClassifier(
    n_estimators=100
)
rfe = RFE(model, n_features_to_select=4)
rfe.fit(X, y)
X.columns[rfe.support_]
Index(['pclass', 'age', 'fare', 'sex_male'], dtype='object')
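
The ranking_ attribute complements support_: selected features get rank 1, and eliminated features get larger ranks the earlier they were dropped. A quick sketch to see the full ordering:

# Rank 1 = selected; larger ranks were eliminated earlier.
pd.Series(rfe.ranking_, index=X.columns).sort_values()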

8.4 Mutual Information

from sklearn import feature_selection

# Estimate the mutual information between each feature and the target.
mic = feature_selection.mutual_info_classif(X, y)

fig, ax = plt.subplots(figsize=(10, 8))
(
  pd.DataFrame(
    {"feature": X.columns, "vimp": mic}
  )
  .set_index("feature")
  .plot.barh(ax=ax)
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f5ed5e6b0d0>
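
Mutual information can capture nonlinear relationships that plain correlation misses. To turn these scores into an actual selection step, scikit-learn's SelectKBest can keep the top-k features by the same score; a sketch with k=5, an arbitrary choice of ours:

from sklearn.feature_selection import (
  SelectKBest,
  mutual_info_classif,
)

# Keep the five features with the highest mutual information.
skb = SelectKBest(mutual_info_classif, k=5)
skb.fit(X, y)
X.columns[skb.get_support()]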