10.1 Logistic Regression

from sklearn.linear_model import (
  LogisticRegression,
)

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
lr.score(X_test, y_test)
0.8015267175572519
lr.predict(X.iloc[[0]])
array([0])
lr.predict_proba(X.iloc[[0]])
array([[0.89709139, 0.10290861]])
lr.predict_log_proba(X.iloc[[0]])
array([[-0.10859754, -2.27391397]])
lr.decision_function(X.iloc[[0]])
array([-2.16531643])
lr.intercept_
array([1.22466932])
import numpy as np

def inv_logit(p):
  return np.exp(p) / (1 + np.exp(p))

inv_logit(lr.intercept_)
array([0.77288422])
cols = X.columns
for col, val in sorted(zip(cols, lr.coef_[0]),
                       key=lambda x: x[1],
                       reverse=True):
  print(f"{col:10}{val:10.3f} {inv_logit(val):10.3f}")
fare           0.104      0.526
parch         -0.063      0.484
sibsp         -0.273      0.432
age           -0.295      0.427
embarked_Q    -0.495      0.379
embarked_S    -0.508      0.376
pclass        -0.738      0.323
sex_male      -2.408      0.083
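
The intercept and coefficients are all the model needs: the decision function is the linear combination of the features with coef_ plus intercept_, and the inverse logit maps that score to a probability. A minimal check, assuming the lr, X, and inv_logit objects defined above:

manual_score = lr.intercept_[0] + X.iloc[0] @ lr.coef_[0]
inv_logit(manual_score)
# manual_score should match lr.decision_function(X.iloc[[0]]) above, and
# inv_logit(manual_score) should match lr.predict_proba(X.iloc[[0]])[0, 1]
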
from yellowbrick.features.importances import (
  FeatureImportances,
)

fig, ax = plt.subplots(figsize=(6, 4))
fi_viz = FeatureImportances(lr)
fi_viz.fit(X, y)
fi_viz.poof()

10.2 Naive Bayes

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
GaussianNB(priors=None, var_smoothing=1e-09)
nb.score(X_test, y_test)
0.7557251908396947
nb.predict(X.iloc[[0]])
array([0])
nb.predict_proba(X.iloc[[0]])
array([[0.95955327, 0.04044673]])
nb.predict_log_proba(X.iloc[[0]])
array([[-0.04128744, -3.20776959]])
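
Under the hood, GaussianNB combines a log prior per class with per-feature Gaussian log-likelihoods and then normalizes. A rough sketch of that computation, assuming the nb and X objects from above (the sigma_ attribute is named var_ in newer scikit-learn):

x0 = X.iloc[0].values
log_joint = [
  np.log(nb.class_prior_[k])
  - 0.5 * np.sum(np.log(2 * np.pi * nb.sigma_[k])
                 + (x0 - nb.theta_[k]) ** 2 / nb.sigma_[k])
  for k in range(len(nb.classes_))
]
# exponentiating and normalizing log_joint reproduces
# nb.predict_proba(X.iloc[[0]])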

10.3 Support Vector Machine

from sklearn.svm import SVC
svc = SVC(random_state=42, probability=True)
svc.fit(X_train, y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=42,
    shrinking=True, tol=0.001, verbose=False)
svc.score(X_test, y_test)
0.806615776081425
svc.predict(X.iloc[[0]])
array([0])
svc.predict_proba(X.iloc[[0]])
array([[0.84850738, 0.15149262]])
svc.predict_log_proba(X.iloc[[0]])
array([[-0.1642765 , -1.88721835]])
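
The probability=True option fits an extra Platt-scaling calibration on top of the SVM so that predict_proba is available; without it, SVC exposes only decision_function. A quick sketch, assuming the same split as above:

svc_plain = SVC(random_state=42, gamma="scale")
svc_plain.fit(X_train, y_train)
svc_plain.decision_function(X.iloc[[0]])
# a negative margin corresponds to class 0, a positive one to class 1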

10.4 K-Nearest Neighbors

from sklearn.neighbors import (
  KNeighborsClassifier,
)

knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
knc.score(X_test, y_test)
0.7684478371501272
knc.predict(X.iloc[[0]])
array([0])
knc.predict_proba(X.iloc[[0]])
array([[0.8, 0.2]])
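
The main knobs are n_neighbors and weights; with distance weighting, closer neighbors count for more than distant ones. A sketch on the same split:

knc9 = KNeighborsClassifier(
  n_neighbors=9, weights="distance"
)
knc9.fit(X_train, y_train)
knc9.score(X_test, y_test)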

10.5 Decision Tree

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(
  random_state=42, max_depth=3
)
dt.fit(X_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
dt.score(X_test, y_test)
0.8142493638676844
dt.predict(X.iloc[[0]])
array([0])
dt.predict_proba(X.iloc[[0]])
array([[0.87954545, 0.12045455]])
dt.predict_log_proba(X.iloc[[0]])
array([[-0.12835003, -2.11648281]])
import pydotplus
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image

dot_data = StringIO()

export_graphviz(
  dt,
  out_file=dot_data,
  feature_names=X.columns,
  class_names=["Died", "Survived"],
  filled=True,
)

g = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(g.create_png())
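
If Graphviz is not available, scikit-learn 0.21 and later can print the same tree as plain text; a sketch assuming the dt and X objects from above:

from sklearn.tree import export_text

print(export_text(dt, feature_names=list(X.columns)))
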
from dtreeviz.trees import dtreeviz

viz = dtreeviz(
  dt,
  X,
  y,
  target_name="survived",
  feature_names=X.columns,
  class_names=["died", "survived"],
)

viz
[dtreeviz renders the fitted tree as a figure: split nodes with per-feature distributions, leaf nodes, and a legend for the died/survived classes]
for col, val in sorted(zip(X.columns, dt.feature_importances_),
                       key=lambda x: x[1],
                       reverse=True)[:5]:
  print(f"{col:10}{val:10.3f}")
sex_male       0.607
pclass         0.248
sibsp          0.052
fare           0.050
age            0.043
from yellowbrick.features.importances import (
  FeatureImportances,
)

fig, ax = plt.subplots(figsize=(6, 4))
fi_viz = FeatureImportances(dt)
fi_viz.fit(X, y)
fi_viz.poof()

10.6 Random Forest

from sklearn.ensemble import (
  RandomForestClassifier,
)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
rf.score(X_test, y_test)
0.7659033078880407
rf.predict(X.iloc[[0]])
array([0])
rf.predict_proba(X.iloc[[0]])
array([[1., 0.]])
rf.predict_log_proba(X.iloc[[0]])
array([[  0., -inf]])
for col, val in sorted(zip(X.columns, rf.feature_importances_),
                       key=lambda x: x[1],
                       reverse=True)[:5]:
  print(f"{col:10}{val:10.3f}")
age            0.280
fare           0.274
sex_male       0.230
pclass         0.086
sibsp          0.053
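
This scikit-learn release defaults to only 10 trees (n_estimators=10 in the output above); raising that number usually stabilizes both the score and the importances. A sketch on the same split:

rf100 = RandomForestClassifier(
  n_estimators=100, random_state=42
)
rf100.fit(X_train, y_train)
rf100.score(X_test, y_test)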

10.7 XGBoost

import xgboost as xgb
xgb_class = xgb.XGBClassifier(random_state=42)
xgb_class.fit(
  X_train,
  y_train,
  early_stopping_rounds=10,
  eval_set=[(X_test, y_test)],
)
[0]	validation_0-error:0.188295
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.188295
[2]	validation_0-error:0.188295
[3]	validation_0-error:0.188295
[4]	validation_0-error:0.188295
[5]	validation_0-error:0.188295
[6]	validation_0-error:0.203562
[7]	validation_0-error:0.203562
[8]	validation_0-error:0.203562
[9]	validation_0-error:0.203562
[10]	validation_0-error:0.203562
Stopping. Best iteration:
[0]	validation_0-error:0.188295

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
xgb_class.score(X_test, y_test)
0.811704834605598
xgb_class.predict(X.iloc[[0]])
array([0])
xgb_class.predict_proba(X.iloc[[0]])
array([[0.53754187, 0.46245816]], dtype=float32)
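
Because training used early stopping, the model records the best round. The sketch below restricts prediction to that round via best_ntree_limit, which is how this older xgboost release exposes it (newer releases use iteration_range instead); treat the exact attribute names as version-dependent:

xgb_class.best_iteration
xgb_class.predict_proba(
  X.iloc[[0]], ntree_limit=xgb_class.best_ntree_limit
)
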
for col, val in sorted(zip(X.columns, xgb_class.feature_importances_,),
                       key=lambda x: x[1],
                       reverse=True)[:5]:
  print(f"{col:10}{val:10.3f}")
sex_male       0.665
pclass         0.155
sibsp          0.069
embarked_S     0.042
age            0.038
fig, ax = plt.subplots(figsize=(6, 4))
xgb.plot_importance(xgb_class, ax=ax)
fig, ax = plt.subplots(figsize=(6, 4))
fi_viz = FeatureImportances(xgb_class)
fi_viz.fit(X, y)
fi_viz.poof()
booster = xgb_class.get_booster()
print(booster.get_dump()[0])
0:[sex_male<1] yes=1,no=2,missing=1
	1:[pclass<0.23096557] yes=3,no=4,missing=3
		3:[fare<-0.142878294] yes=7,no=8,missing=7
			7:leaf=0.132530123
			8:leaf=0.184
		4:[fare<-0.195437849] yes=9,no=10,missing=9
			9:leaf=0.0245989319
			10:leaf=-0.145945951
	2:[age<-1.49289274] yes=5,no=6,missing=5
		5:[sibsp<1.81278062] yes=11,no=12,missing=11
			11:leaf=0.135483876
			12:leaf=-0.150000006
		6:[pclass<-0.957598865] yes=13,no=14,missing=13
			13:leaf=-0.0666666701
			14:leaf=-0.148717955

# converting an example leaf score (a raw log-odds value) to a probability
# with the inverse logit
1 / (1 + np.exp(-1 * 0.1238))
0.5309105310475829
fig, ax = plt.subplots(figsize=(50, 50))
xgb.plot_tree(xgb_class, ax=ax, num_trees=0)
import xgbfir

xgbfir.saveXgbFI(
  xgb_class,
  feature_names=X.columns,
  OutputXlsxFile="fir.xlsx",
)
pd.read_excel("/content/fir.xlsx").head(3).T
                            0         1         2
Interaction          sex_male    pclass      fare
Gain                  2026.35   743.409   711.231
FScore                     48        53       279
wFScore               39.4676    22.203    121.88
Average wFScore      0.822241  0.418924  0.436845
Average Gain          42.2157   14.0266   2.54922
Expected Gain         2016.01   298.442   291.862
Gain Rank                   1         2         3
FScore Rank                 4         3         1
wFScore Rank                3         4         1
Avg wFScore Rank            1         7         6
Avg Gain Rank               1         2         5
Expected Gain Rank          1         2         3
Average Rank          1.83333   3.33333   3.16667
Average Tree Index    36.8333   22.9434   54.3405
Average Tree Depth      0.375   1.20755   1.42652
pd.read_excel(
  "fir.xlsx",
  sheet_name="Interaction Depth 1",
).head(2).T
                                  0             1
Interaction         pclass|sex_male  age|sex_male
Gain                        3301.86       1378.75
FScore                           38            15
wFScore                     15.9816        8.5093
Average wFScore            0.420569      0.567287
Average Gain                86.8911       91.9168
Expected Gain               1375.25       885.229
Gain Rank                         1             2
FScore Rank                       4            12
wFScore Rank                      4             9
Avg wFScore Rank                 11             6
Avg Gain Rank                     2             1
Expected Gain Rank                1             2
Average Rank                3.83333       5.33333
Average Tree Index          15.9474          32.4
Average Tree Depth          1.02632             1
pd.read_excel(
  "fir.xlsx",
  sheet_name="Interaction Depth 2",
).head(1).T
                                       0
Interaction         fare|pclass|sex_male
Gain                             4891.87
FScore                                44
wFScore                           7.8619
Average wFScore                 0.178679
Average Gain                     111.179
Expected Gain                     870.56
Gain Rank                              1
FScore Rank                            1
wFScore Rank                           5
Avg wFScore Rank                      31
Avg Gain Rank                          2
Expected Gain Rank                     2
Average Rank                           7
Average Tree Index               16.8864
Average Tree Depth                     2
pd.read_excel(
  "fir.xlsx",
  sheet_name="Interaction Depth 2",
)[["Interaction", "Gain"]].head()
                  Interaction         Gain
0        fare|pclass|sex_male  4891.867318
1         age|pclass|sex_male  2999.230953
2          age|sex_male|sibsp  1518.797561
3           age|fare|sex_male   334.379201
4  embarked_S|pclass|sex_male   225.302789

10.8 Gradient Boosting with LightGBM

import lightgbm as lgb
lgbm_class = lgb.LGBMClassifier(
  random_state=42
)
lgbm_class.fit(X_train, y_train)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
lgbm_class.score(X_test, y_test)
0.806615776081425
lgbm_class.predict(X.iloc[[0]])
array([0])
lgbm_class.predict_proba(X.iloc[[0]])
array([[0.98090161, 0.01909839]])
for col, val in sorted(zip(cols, lgbm_class.feature_importances_),
                       key=lambda x: x[1],
                       reverse=True)[:5]:
  print(f"{col:10}{val:10.3f}")
fare        1285.000
age         1198.000
sex_male     113.000
pclass       112.000
sibsp         99.000
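
The values above are split counts, LightGBM's default importance_type; gain-based importances are often easier to interpret. A sketch:

lgbm_gain = lgb.LGBMClassifier(
  random_state=42, importance_type="gain"
)
lgbm_gain.fit(X_train, y_train)
lgbm_gain.feature_importances_
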
fig, ax = plt.subplots(figsize=(6, 4))
lgb.plot_importance(lgbm_class, ax=ax)
fig.tight_layout()
fig, ax = plt.subplots(figsize=(200, 200))
lgb.plot_tree(lgbm_class, tree_index=0, ax=ax)

10.9 TPOT

from tpot import TPOTClassifier
tc = TPOTClassifier(generations=2)
tc.fit(X_train, y_train)
tc.score(X_test, y_test)
0.7888040712468194
tc.predict(X.iloc[[0]])
array([0])
tc.predict_proba(X.iloc[[0]])
array([[0.92383425, 0.07616575]])
tc.export("tpot_exported_pipeline.py")
True
!cat ./tpot_exported_pipeline.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.8209729151817534
exported_pipeline = RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.5, min_samples_leaf=5, min_samples_split=18, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
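
The exported file reads from a CSV placeholder, so it does not run as-is on our data. To reuse the pipeline TPOT found on the existing split, the suggested estimator can be fit directly; a sketch using the hyperparameters from the export above:

from sklearn.ensemble import RandomForestClassifier

best = RandomForestClassifier(
  bootstrap=False, criterion="entropy", max_features=0.5,
  min_samples_leaf=5, min_samples_split=18, n_estimators=100,
)
best.fit(X_train, y_train)
best.score(X_test, y_test)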