Chapter 10: Classification
This chapter walks through how to fit, score, and inspect a variety of classification algorithms.
- 10.1 Logistic Regression
- 10.2 Naive Bayes
- 10.3 Support Vector Machine
- 10.4 K-Nearest Neighbors
- 10.5 Decision Tree
- 10.6 Random Forest
- 10.7 XGBoost
- 10.8 Gradient Boosting with LightGBM
- 10.9 TPOT
from sklearn.linear_model import (
    LogisticRegression,
)

# Fit logistic regression, report test-set accuracy, and inspect the
# model's outputs for the first row.
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
lr.predict(X.iloc[[0]])
lr.predict_proba(X.iloc[[0]])
lr.predict_log_proba(X.iloc[[0]])
lr.decision_function(X.iloc[[0]])
lr.intercept_

import numpy as np

# The inverse logit (sigmoid) turns a log-odds value into a probability.
def inv_logit(p):
    return np.exp(p) / (1 + np.exp(p))

# The intercept, transformed, is the predicted probability when every
# feature is zero.
inv_logit(lr.intercept_)
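As a quick sanity check, a minimal sketch using the objects above: for binary logistic regression, the positive-class probability from predict_proba is exactly the inverse logit of decision_function.

proba = lr.predict_proba(X.iloc[[0]])[0, 1]
margin = lr.decision_function(X.iloc[[0]])[0]
np.allclose(proba, inv_logit(margin))  # expect True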
# Coefficients sorted from most positive to most negative, with the
# inverse logit of each value shown alongside.
cols = X.columns
for col, val in sorted(
    zip(cols, lr.coef_[0]),
    key=lambda x: x[1],
    reverse=True,
):
    print(f"{col:10}{val:10.3f} {inv_logit(val):10.3f}")
import matplotlib.pyplot as plt
from yellowbrick.features.importances import (
    FeatureImportances,
)

# Yellowbrick plots the model's coefficients as a feature importance
# chart.
fig, ax = plt.subplots(figsize=(6, 4))
fi_viz = FeatureImportances(lr)
fi_viz.fit(X, y)
fi_viz.poof()
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes assumes features are normally distributed and
# conditionally independent given the class.
nb = GaussianNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)
nb.predict(X.iloc[[0]])
nb.predict_proba(X.iloc[[0]])
nb.predict_log_proba(X.iloc[[0]])
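The fitted model keeps one Gaussian per feature and class; two attributes worth inspecting:

nb.class_prior_  # class probabilities estimated from y_train
nb.theta_        # per-class feature means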
from sklearn.svm import SVC

# probability=True is required for predict_proba and
# predict_log_proba to work.
svc = SVC(random_state=42, probability=True)
svc.fit(X_train, y_train)
svc.score(X_test, y_test)
svc.predict(X.iloc[[0]])
svc.predict_proba(X.iloc[[0]])
svc.predict_log_proba(X.iloc[[0]])
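Because probability=True fits a cross-validated Platt-scaling calibrator, predict_proba can disagree slightly with predict. The raw, uncalibrated margin is available either way:

svc.decision_function(X.iloc[[0]])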
from sklearn.neighbors import (
    KNeighborsClassifier,
)

# Defaults to n_neighbors=5. Note there is no predict_log_proba here.
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
knc.score(X_test, y_test)
knc.predict(X.iloc[[0]])
knc.predict_proba(X.iloc[[0]])
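The main knob is the number of neighbors; a minimal sweep over k, assuming the same train/test split as above:

for k in [3, 5, 7, 9]:
    knc_k = KNeighborsClassifier(n_neighbors=k)
    knc_k.fit(X_train, y_train)
    print(k, knc_k.score(X_test, y_test))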
from sklearn.tree import DecisionTreeClassifier

# max_depth=3 keeps the tree small enough to visualize.
dt = DecisionTreeClassifier(
    random_state=42, max_depth=3
)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
dt.predict(X.iloc[[0]])
dt.predict_proba(X.iloc[[0]])
dt.predict_log_proba(X.iloc[[0]])
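Before the graphviz rendering below, note that scikit-learn (0.21+) can also dump the same tree as plain text with no extra dependencies:

from sklearn.tree import export_text
print(export_text(dt, feature_names=list(X.columns)))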
import pydotplus
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image

# Export the tree as DOT and render it inline with pydotplus.
dot_data = StringIO()
export_graphviz(
    dt,
    out_file=dot_data,
    feature_names=X.columns,
    class_names=["Died", "Survived"],
    filled=True,
)
g = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(g.create_png())
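The rendered graph can also be written straight to a file instead of being shown inline; the filename here is just an example:

g.write_png("dt_tree.png")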
# dtreeviz draws the training data distribution at each node. (This is
# the pre-2.0 dtreeviz API; newer releases use dtreeviz.model()
# instead.)
from dtreeviz.trees import dtreeviz
viz = dtreeviz(
    dt,
    X,
    y,
    target_name="survived",
    feature_names=X.columns,
    class_names=["died", "survived"],
)
viz
# Top five features by impurity-based importance.
for col, val in sorted(
    zip(X.columns, dt.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
from yellowbrick.features.importances import (
FeatureImportances,
)
fig, ax = plt.subplots(figsize=(6, 4))
fi_viz = FeatureImportances(dt)
fi_viz.fit(X, y)
fi_viz.poof()
from sklearn.ensemble import (
    RandomForestClassifier,
)

# An ensemble of decision trees trained on bootstrap samples.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
rf.predict(X.iloc[[0]])
rf.predict_proba(X.iloc[[0]])
rf.predict_log_proba(X.iloc[[0]])
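Because each tree sees only a bootstrap sample, the forest can score itself on the left-out rows; a minimal sketch using oob_score:

rf_oob = RandomForestClassifier(
    random_state=42, oob_score=True
)
rf_oob.fit(X_train, y_train)
rf_oob.oob_score_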
for col, val in sorted(
    zip(X.columns, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
import xgboost as xgb

# Stop training once the eval-set metric fails to improve for 10
# rounds. (Recent XGBoost releases take early_stopping_rounds in the
# constructor rather than in fit().)
xgb_class = xgb.XGBClassifier(random_state=42)
xgb_class.fit(
    X_train,
    y_train,
    early_stopping_rounds=10,
    eval_set=[(X_test, y_test)],
)
xgb_class.score(X_test, y_test)
xgb_class.predict(X.iloc[[0]])
xgb_class.predict_proba(X.iloc[[0]])
for col, val in sorted(
    zip(X.columns, xgb_class.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
fig, ax = plt.subplots(figsize=(6, 4))
xgb.plot_importance(xgb_class, ax=ax)
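plot_importance counts splits per feature by default ("weight"); gain-based importance often ranks features differently and is worth comparing:

fig, ax = plt.subplots(figsize=(6, 4))
xgb.plot_importance(
    xgb_class, ax=ax, importance_type="gain"
)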
fig, ax = plt.subplots(figsize=(6, 4))
fi_viz = FeatureImportances(xgb_class)
fi_viz.fit(X, y)
fi_viz.poof()
# Dump the first boosted tree as text; leaf values are log-odds
# contributions.
booster = xgb_class.get_booster()
print(booster.get_dump()[0])

# Converting a leaf value from the dump (here 0.1238) to a probability
# with the inverse logit:
1 / (1 + np.exp(-1 * 0.1238))
fig, ax = plt.subplots(figsize=(50, 50))
xgb.plot_tree(xgb_class, ax=ax, num_trees=0)
import pandas as pd
import xgbfir

# xgbfir ranks feature interactions found in the boosted trees and
# saves them to a spreadsheet.
xgbfir.saveXgbFI(
    xgb_class,
    feature_names=X.columns,
    OutputXlsxFile="fir.xlsx",
)
pd.read_excel("fir.xlsx").head(3).T
pd.read_excel(
"fir.xlsx",
sheet_name="Interaction Depth 1",
).head(2).T
pd.read_excel(
"fir.xlsx",
sheet_name="Interaction Depth 2",
).head(1).T
pd.read_excel(
"fir.xlsx",
sheet_name="Interaction Depth 2",
)[["Interaction", "Gain"]].head()
import lightgbm as lgb
lgbm_class = lgb.LGBMClassifier(
random_state=42
)
lgbm_class.fit(X_train, y_train)
lgbm_class.score(X_test, y_test)
lgbm_class.predict(X.iloc[[0]])
lgbm_class.predict_proba(X.iloc[[0]])
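LightGBM supports early stopping too. Recent releases configure it through callbacks (older ones took an early_stopping_rounds argument in fit()); a sketch assuming the callback API:

lgbm_es = lgb.LGBMClassifier(random_state=42)
lgbm_es.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(10)],
)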
for col, val in sorted(
    zip(cols, lgbm_class.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
fig, ax = plt.subplots(figsize=(6, 4))
lgb.plot_importance(lgbm_class, ax=ax)
fig.tight_layout()
fig, ax = plt.subplots(figsize=(200, 200))
lgb.plot_tree(lgbm_class, tree_index=0, ax=ax)
from tpot import TPOTClassifier

# TPOT searches over whole pipelines with genetic programming; even
# two generations can take a while to run.
tc = TPOTClassifier(generations=2)
tc.fit(X_train, y_train)
tc.score(X_test, y_test)
tc.predict(X.iloc[[0]])
tc.predict_proba(X.iloc[[0]])
tc.export("tpot_exported_pipeline.py")
!cat ./tpot_exported_pipeline.py
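Besides the exported script, the winning pipeline is also exposed in-memory as an ordinary scikit-learn Pipeline that can be inspected or pickled directly:

tc.fitted_pipeline_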