19장 파이프라인
파이프라인 기능을 알아봅니다.
import pandas as pd
from sklearn.experimental import (
enable_iterative_imputer,
)
from sklearn import (
ensemble,
impute,
model_selection,
preprocessing,
tree,
)
from sklearn.base import (
BaseEstimator,
TransformerMixin,
)
from sklearn.ensemble import (
RandomForestClassifier,
)
from sklearn.pipeline import Pipeline
def tweak_titanic(df):
    """Return a cleaned, one-hot encoded version of the Titanic frame.

    The ``name``, ``ticket``, ``home.dest``, ``boat``, ``body`` and ``cabin``
    columns are removed, and what is left is passed through
    ``pd.get_dummies(..., drop_first=True)``.

    ``df.drop`` returns a new frame, so the caller's ``df`` is left as-is.
    A ``KeyError`` is raised by pandas if any of the listed columns is
    missing from ``df``.
    """
    unwanted = ["name", "ticket", "home.dest", "boat", "body", "cabin"]
    trimmed = df.drop(columns=unwanted)
    return pd.get_dummies(trimmed, drop_first=True)
class TitanicTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that turns the raw Titanic frame into model features.

    The input is expected to be the frame produced by reading the Titanic
    Excel file.  It is cleaned with ``tweak_titanic`` and the ``survived``
    target column, when present, is removed so it cannot leak into the
    features.
    """

    def fit(self, X, y=None):
        """Remember the feature columns produced from the training frame.

        ``y`` is optional, following the scikit-learn transformer
        convention, so ``fit_transform(X)`` works without a target.

        Returns ``self``.
        """
        self.columns_ = self._featurize(X).columns
        return self

    def transform(self, X):
        """Return the feature frame for ``X``.

        After ``fit`` the result is aligned to the columns seen during
        fitting: dummy columns that did not appear in ``X`` are added and
        filled with 0, and columns unseen at fit time are dropped.  This
        keeps the feature layout stable between training and prediction.
        Without a prior ``fit`` the features are returned unaligned.
        """
        features = self._featurize(X)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is not None:
            features = features.reindex(columns=fitted_columns, fill_value=0)
        return features

    @staticmethod
    def _featurize(X):
        """Clean ``X`` and strip the target column if it is there."""
        # errors="ignore": unlabeled frames passed at prediction time have
        # no "survived" column and must not raise.
        return tweak_titanic(X).drop(columns="survived", errors="ignore")
# Titanic classification pipeline: feature clean-up -> iterative imputation
# of missing values -> standardization -> random forest classifier.
pipe = Pipeline(
    steps=[
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        ("std", preprocessing.StandardScaler()),
        ("rf", RandomForestClassifier()),
    ]
)
from sklearn.model_selection import train_test_split

# Read the raw Titanic spreadsheet; orig_df is a second name for the same
# frame, not a copy.
url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
orig_df = df

# Hold out 30% of the rows with a fixed seed.  The whole frame is passed as
# X; the "titan" pipeline step removes the "survived" column itself.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    orig_df, orig_df["survived"], test_size=0.3, random_state=42
)

# Fit the pipeline on the training split and score it on the held-out rows.
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)
# Hyper-parameter grid for the "rf" step; keys use the
# "<step name>__<parameter>" form that Pipeline.set_params understands.
params = {
    # "auto" is no longer accepted by RandomForestClassifier in current
    # scikit-learn releases; for classifiers it meant "sqrt".
    "rf__max_features": [0.4, "sqrt"],
    "rf__n_estimators": [15, 200],
}
grid = model_selection.GridSearchCV(pipe, cv=3, param_grid=params)

# Search on the training split only, so the held-out rows used for the
# final score below never influence the choice of hyper-parameters.
grid.fit(X_train2, y_train2)
grid.best_params_

# Apply the winning settings before refitting; without this the refit
# would just repeat the default-parameter fit.
pipe.set_params(**grid.best_params_)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

from sklearn import metrics

# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than thresholded 0/1 labels.
metrics.roc_auc_score(y_test2, pipe.predict_proba(X_test2)[:, 1])
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# NOTE(review): bos_X_train, bos_X_test, bos_y_train and bos_y_test are not
# defined in this section -- presumably a train/test split prepared
# elsewhere; confirm they are in scope before running.

# Regression pipeline: standardize the features, then fit ordinary least
# squares.
reg_pipe = Pipeline(
    steps=[
        ("std", preprocessing.StandardScaler()),
        ("lr", LinearRegression()),
    ]
)
reg_pipe.fit(bos_X_train, bos_y_train)
reg_pipe.score(bos_X_test, bos_y_test)

# Inspect the fitted linear model through its step name.
reg_pipe.named_steps["lr"].intercept_
reg_pipe.named_steps["lr"].coef_

metrics.mean_squared_error(
    y_true=bos_y_test,
    y_pred=reg_pipe.predict(bos_X_test),
)
from sklearn.decomposition import PCA

# NOTE(review): X is not defined in this section -- presumably a numeric
# feature matrix prepared elsewhere; confirm it is in scope before running.

# PCA pipeline: standardize the features, then project onto the principal
# components.
pca_pipe = Pipeline(
    steps=[
        ("std", preprocessing.StandardScaler()),
        ("pca", PCA()),
    ]
)
X_pca = pca_pipe.fit_transform(X)

# Variance share of each component, then the loadings of the first one.
pca_pipe.named_steps["pca"].explained_variance_ratio_
pca_pipe.named_steps["pca"].components_[0]