3장 분류 문제 둘러보기(타이타닉 데이터셋)
타이타닉 데이터셋으로 머신러닝의 전반적인 내용을 살펴봅니다.
- 3.2 필요한 패키지
- 3.5 데이터의 수집
- 3.6 데이터의 정리
- 3.7 특징의 생성
- 3.8 샘플 데이터
- 3.9 데이터의 대치
- 3.10 데이터의 표준화
- 3.11 리팩터링
- 3.12 베이스라인 모델
- 3.13 다양한 알고리즘
- 3.14 스태킹
- 3.15 모델 만들기
- 3.16 모델의 평가
- 3.17 모델의 최적화
- 3.18 오차 행렬
- 3.19 ROC 곡선
- 3.20 학습 곡선
- 3.21 모델의 배포
# 3.5 Data collection: pull the Titanic dataset directly from the web.
url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
# Keep a handle on the untouched frame; the refactored pipeline reuses it.
orig_df = df

# 3.6 Inspect structure and missingness (bare expressions display in a notebook).
df.columns
df.dtypes
df.shape
df.describe().iloc[:, :2]
df.isnull().sum()

# Rows with at least one missing value.
mask = df.isnull().any(axis=1)
mask.head()
df.loc[mask, "body"].head()

# Category counts, including NaN, for the two string columns of interest.
df["sex"].value_counts(dropna=False)
df["embarked"].value_counts(dropna=False)

# Peek at the name column before it gets dropped.
name = df["name"]
name.head(3)
# 3.7 Feature creation: remove leaky / high-cardinality text columns
# (boat and body leak the outcome; name/ticket/cabin are near-unique).
unwanted = [
    "name",
    "ticket",
    "home.dest",
    "boat",
    "body",
    "cabin",
]
df = df.drop(columns=unwanted)

# One-hot encode the remaining string columns, then drop the redundant
# sex_male indicator (it is the complement of sex_female).
df = pd.get_dummies(df)
df.columns
df = df.drop(columns="sex_male")
# Mirrors the notebook flow: with no object columns left this call is a
# no-op apart from the drop_first bookkeeping.
df = pd.get_dummies(df, drop_first=True)
df.columns

# 3.8 Separate label from features and hold out 30% for testing.
y = df.survived
X = df.drop(columns="survived")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# 3.9 Data imputation: IterativeImputer is still experimental, so the
# enabling side-effect import is required before `impute` exposes it.
from sklearn.experimental import (
    enable_iterative_imputer,  # noqa: F401 -- side-effect import
)
from sklearn import impute

# Numeric columns eligible for model-based imputation.
num_cols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "sex_female",
]
# Fit on the training split only to avoid leaking test-set statistics.
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(
    X_train[num_cols]
)
X_train.loc[:, num_cols] = imputed
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

# Fall back to train medians for anything still missing.
meds = X_train.median()
X_train = X_train.fillna(meds)
X_test = X_test.fillna(meds)
X_train.head()

# 3.10 Standardization.
# BUG FIX: the original fit the scaler on ALL columns and then kept the
# first four positions labeled pclass/age/sibsp/fare -- but the fourth
# positional column of X is "parch", so the "fare" column was actually
# scaled parch values. Scale only the intended columns and keep labels
# aligned with the data.
cols = "pclass,age,sibsp,fare".split(",")
sca = preprocessing.StandardScaler()
X_train = pd.DataFrame(
    sca.fit_transform(X_train[cols]), columns=cols, index=X_train.index
)
X_test = pd.DataFrame(
    sca.transform(X_test[cols]), columns=cols, index=X_test.index
)
def tweak_titanic(df):
    """Drop leaky / high-cardinality columns and one-hot encode the rest.

    Returns a new DataFrame; the input frame is not modified.
    """
    unwanted = [
        "name",
        "ticket",
        "home.dest",
        "boat",
        "body",
        "cabin",
    ]
    # drop_first removes one dummy per categorical to avoid collinearity.
    return pd.get_dummies(df.drop(columns=unwanted), drop_first=True)
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    """Split *df* into train/test sets, impute numeric columns, and
    optionally standardize a subset of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Tidied data containing ``y_col``.
    y_col : str
        Name of the label column.
    size : float
        Test fraction passed to ``train_test_split``.
    std_cols : list of str, optional
        Columns to standardize; the scaler is fit on the training split only.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=42
    )
    # Work on explicit copies so the .loc assignments below cannot raise
    # SettingWithCopyWarning on views of the split frames.
    X_train = X_train.copy()
    X_test = X_test.copy()
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    # Fit the imputer on training data only to avoid test-set leakage.
    fi = impute.IterativeImputer()
    X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])
    if std_cols:
        std = preprocessing.StandardScaler()
        X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
        X_test.loc[:, std_cols] = std.transform(X_test[std_cols])
    return X_train, X_test, y_train, y_test
# 3.12 Baseline model: rebuild the data with the refactored helpers, then
# score a DummyClassifier as the floor every real model must beat.
ti_df = tweak_titanic(orig_df)
std_cols = ["pclass", "age", "sibsp", "fare"]
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df, "survived", std_cols=std_cols
)

from sklearn.dummy import DummyClassifier

bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)  # accuracy

from sklearn import metrics

metrics.precision_score(y_test, bm.predict(X_test))

# Recombine the splits for the cross-validation comparisons that follow.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
# 3.13 Compare a family of algorithms via 10-fold cross-validated AUC.
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import (
    LogisticRegression,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import (
    KNeighborsClassifier,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
)
import xgboost

# BUG FIX: KFold(random_state=...) without shuffle=True raises ValueError
# on scikit-learn >= 0.24 (random_state only applies when shuffling).
# The fold object is loop-invariant, so build it once outside the loop;
# with a fixed seed it produces identical splits for every model.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
for model in [
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    xgboost.XGBClassifier,
]:
    cls = model()
    s = model_selection.cross_val_score(cls, X, y, scoring="roc_auc", cv=kfold)
    print(f"{model.__name__:22} AUC: {s.mean():.3f} STD: {s.std():.2f}")
# 3.14 Stacking: feed base-classifier predictions into a logistic-regression
# meta-learner.
from mlxtend.classifier import (
    StackingClassifier,
)

clfs = [
    x()
    for x in [
        LogisticRegression,
        DecisionTreeClassifier,
        KNeighborsClassifier,
        GaussianNB,
        SVC,
        RandomForestClassifier,
    ]
]
stack = StackingClassifier(
    classifiers=clfs,
    meta_classifier=LogisticRegression(),
)
# BUG FIX: shuffle=True is required when random_state is set
# (ValueError on scikit-learn >= 0.24 otherwise).
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
s = model_selection.cross_val_score(stack, X, y, scoring="roc_auc", cv=kfold)
print(f"{stack.__class__.__name__} AUC: {s.mean():.3f} STD: {s.std():.2f}")
# 3.15 Build a random forest model.
# FIX: use the RandomForestClassifier imported above -- the `ensemble`
# module alias is never imported in this file and would raise NameError.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
metrics.precision_score(y_test, rf.predict(X_test))

# Top five features by impurity-based importance.
for col, val in sorted(
    zip(X_train.columns, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
# 3.17 Optimize hyperparameters with an exhaustive grid search.
# FIX: `ensemble` is never imported here; use the imported class directly.
rf4 = RandomForestClassifier()
params = {
    # FIX: "auto" was removed from max_features in scikit-learn 1.3;
    # "sqrt" is the equivalent setting for classifiers.
    "max_features": [0.4, "sqrt"],
    "n_estimators": [15, 200],
    "min_samples_leaf": [1, 0.1],
    "random_state": [42],
}
cv = model_selection.GridSearchCV(rf4, params, n_jobs=-1).fit(X_train, y_train)
print(cv.best_params_)
# Refit with the best parameters found by the grid search.
# FIXES: `ensemble` alias replaced by the imported class; "auto" replaced
# by its classifier equivalent "sqrt" (removed in scikit-learn 1.3).
rf5 = RandomForestClassifier(
    max_features="sqrt",
    min_samples_leaf=0.1,
    n_estimators=200,
    random_state=42,
)
rf5.fit(X_train, y_train)
rf5.score(X_test, y_test)
# 3.18 Confusion matrix: raw counts, then a yellowbrick visualization.
from sklearn.metrics import confusion_matrix

y_pred = rf5.predict(X_test)
confusion_matrix(y_test, y_pred)

mapping = {0: "died", 1: "survived"}
fig, ax = plt.subplots(figsize=(6, 6))
cm_viz = ConfusionMatrix(
    rf5,
    classes=["died", "survived"],
    label_encoder=mapping,
)
cm_viz.score(X_test, y_test)
# FIX: poof() was deprecated and removed in yellowbrick 1.0+; use show().
cm_viz.show()
plt.show()
# 3.19 ROC curve.
# FIX: score AUC on the positive-class probabilities, not on hard 0/1
# predictions -- roc_auc_score on predict() output collapses the ROC
# curve to a single threshold and understates the AUC.
y_score = rf5.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score)
fig, ax = plt.subplots(figsize=(6, 6))
roc_viz = ROCAUC(rf5)
roc_viz.score(X_test, y_test)
# FIX: poof() was deprecated and removed in yellowbrick 1.0+; use show().
roc_viz.show()
plt.show()
# 3.20 Learning curve: weighted-F1 versus training-set size.
import numpy as np

fig, ax = plt.subplots(figsize=(6, 4))
cv = StratifiedKFold(12)
sizes = np.linspace(0.3, 1.0, 10)
lc_viz = LearningCurve(
    rf5,
    cv=cv,
    train_sizes=sizes,
    scoring="f1_weighted",
    n_jobs=4,
    ax=ax,
)
lc_viz.fit(X, y)
# FIX: poof() was deprecated and removed in yellowbrick 1.0+; use show().
lc_viz.show()
plt.show()
# 3.21 Deploy: round-trip the model through pickle and confirm the restored
# model still scores. NOTE(security): only unpickle data you trust --
# pickle.loads can execute arbitrary code.
import pickle

pic = pickle.dumps(rf5)
rf6 = pickle.loads(pic)
# FIX: evaluate AUC on class probabilities rather than hard labels,
# consistent with the ROC-curve section above.
roc_auc_score(y_test, rf6.predict_proba(X_test)[:, 1])