Chapter 14: Regression

This chapter covers regression: predicting a continuous target. We fit a series of models to the Boston housing data and compare their test-set scores.

- Data Preparation
- 14.1 Baseline Model
- 14.2 Linear Regression
- 14.3 SVM
- 14.4 K-Nearest Neighbors
- 14.5 Decision Tree
- 14.6 Random Forest
- 14.7 XGBoost Regression
- 14.8 LightGBM Regression
Data Preparation

# Load the Boston housing data and build raw and standardized
# train/test splits. Note: load_boston was deprecated in
# scikit-learn 1.0 and removed in 1.2 (see the sketch below).
import pandas as pd
from sklearn.datasets import load_boston
from sklearn import (
    model_selection,
    preprocessing,
)

b = load_boston()
bos_X = pd.DataFrame(
    b.data, columns=b.feature_names
)
bos_y = b.target

bos_X_train, bos_X_test, bos_y_train, bos_y_test = model_selection.train_test_split(
    bos_X,
    bos_y,
    test_size=0.3,
    random_state=42,
)

# Standardized copy of the features. Caution: fitting the scaler
# on the full dataset before splitting leaks test-set statistics;
# in practice, fit the scaler on the training split only.
bos_sX = preprocessing.StandardScaler().fit_transform(bos_X)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = model_selection.train_test_split(
    bos_sX,
    bos_y,
    test_size=0.3,
    random_state=42,
)
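Since load_boston is gone from scikit-learn 1.2 onward, here is a minimal sketch of loading the same data straight from the original CMU archive, following the pattern scikit-learn's deprecation notice suggested (the URL, two-row record layout, and column names are taken from that notice and the dataset's documentation, not from this chapter):

import numpy as np
import pandas as pd

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two physical rows in the raw file.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
         "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
bos_X = pd.DataFrame(data, columns=names)
bos_y = target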
14.1 Baseline Model

# DummyRegressor predicts the training mean by default; its score
# is the floor any real model should beat.
from sklearn.dummy import DummyRegressor

dr = DummyRegressor()
dr.fit(bos_X_train, bos_y_train)
dr.score(bos_X_test, bos_y_test)
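.score returns R², which is roughly zero (often slightly negative) for a mean predictor. To compare models in the target's own units, a minimal sketch using scikit-learn's standard metrics:

from sklearn import metrics

# Error of the mean-prediction baseline, in the target's units ($1000s).
dr_preds = dr.predict(bos_X_test)
metrics.mean_squared_error(bos_y_test, dr_preds)
metrics.mean_absolute_error(bos_y_test, dr_preds)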
14.2 Linear Regression

from sklearn.linear_model import (
    LinearRegression,
)

# Fit on the raw features first.
lr = LinearRegression()
lr.fit(bos_X_train, bos_y_train)
lr.score(bos_X_test, bos_y_test)
lr.coef_

# Refit on the standardized features so coefficient magnitudes
# are comparable across features.
lr2 = LinearRegression()
lr2.fit(bos_sX_train, bos_sy_train)
lr2.score(bos_sX_test, bos_sy_test)
lr2.intercept_
lr2.coef_
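The bare coef_ array is hard to read on its own. A small sketch pairing each standardized coefficient with its column name:

# Signed coefficient per feature, smallest to largest.
pd.Series(lr2.coef_, index=bos_X.columns).sort_values()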
# Visualize the standardized coefficients with Yellowbrick.
import matplotlib.pyplot as plt
from yellowbrick.features import (
    FeatureImportances,
)

fig, ax = plt.subplots(figsize=(10, 8))
fi_viz = FeatureImportances(
    lr2, labels=bos_X.columns
)
fi_viz.fit(bos_sX, bos_y)
fi_viz.poof()  # .poof() was renamed .show() in Yellowbrick 1.0
14.3 SVM

# Support vector regression is sensitive to feature scale,
# so fit it on the standardized data.
from sklearn.svm import SVR

svr = SVR()
svr.fit(bos_sX_train, bos_sy_train)
svr.score(bos_sX_test, bos_sy_test)
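SVR's default RBF kernel rarely does its best out of the box. A sketch of tuning C and gamma with a grid search; the candidate values are illustrative assumptions, not recommendations from the source:

params = {
    "C": [0.1, 1, 10, 100],         # larger C fits the training data more closely
    "gamma": ["scale", 0.1, 0.01],  # RBF kernel width
}
gs = model_selection.GridSearchCV(SVR(), params, cv=5)
gs.fit(bos_sX_train, bos_sy_train)
gs.best_params_
gs.score(bos_sX_test, bos_sy_test)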
14.4 K-Nearest Neighbors

# Distance-based, so it also needs the standardized features.
from sklearn.neighbors import (
    KNeighborsRegressor,
)

knr = KNeighborsRegressor()
knr.fit(bos_sX_train, bos_sy_train)
knr.score(bos_sX_test, bos_sy_test)
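The default of five neighbors is just a starting point. A quick sketch sweeping n_neighbors to watch the bias/variance trade-off (the candidate values are assumptions):

for k in (3, 5, 7, 9, 11):
    knr_k = KNeighborsRegressor(n_neighbors=k)
    knr_k.fit(bos_sX_train, bos_sy_train)
    print(k, knr_k.score(bos_sX_test, bos_sy_test))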
14.5 Decision Tree

# Trees are scale-invariant, so the raw features are fine.
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(bos_X_train, bos_y_train)
dtr.score(bos_X_test, bos_y_test)
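An unconstrained tree memorizes the training set. A quick check of train versus test R², then a capped refit; max_depth=8 is an illustrative value, not tuned:

dtr.score(bos_X_train, bos_y_train)  # 1.0: every training sample lands in its own leaf region

dtr_capped = DecisionTreeRegressor(random_state=42, max_depth=8)
dtr_capped.fit(bos_X_train, bos_y_train)
dtr_capped.score(bos_X_test, bos_y_test)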
# Export the full tree to Graphviz format and render it inline.
import pydotplus
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

dot_data = StringIO()
export_graphviz(
    dtr,
    out_file=dot_data,
    feature_names=bos_X.columns,
    filled=True,
)
g = pydotplus.graph_from_dot_data(
    dot_data.getvalue()
)
Image(g.create_png())
# The full tree is unreadably large; limit the rendered depth.
dot_data = StringIO()
export_graphviz(
    dtr,
    max_depth=2,
    out_file=dot_data,
    feature_names=bos_X.columns,
    filled=True,
)
g = pydotplus.graph_from_dot_data(
    dot_data.getvalue()
)
Image(g.create_png())
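Rendering through pydotplus also requires the external Graphviz binaries. A sketch of the same depth-limited view using scikit-learn's built-in, matplotlib-only plot_tree instead:

from sklearn import tree

fig, ax = plt.subplots(figsize=(12, 8))
tree.plot_tree(
    dtr,
    max_depth=2,
    feature_names=list(bos_X.columns),
    filled=True,
    ax=ax,
)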
# dtreeviz also shows the distribution of samples at each split.
# (This is the pre-2.0 API; dtreeviz >= 2.0 uses
# dtreeviz.model(...).view() instead.)
from dtreeviz.trees import dtreeviz

dtr3 = DecisionTreeRegressor(max_depth=2)
dtr3.fit(bos_X_train, bos_y_train)
viz = dtreeviz(
    dtr3,
    bos_X,
    bos_y,
    target_name="price",
    feature_names=bos_X.columns,
    scale=2.5,
)
viz
# Top five features by impurity-based importance.
for col, val in sorted(
    zip(bos_X.columns, dtr.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
14.6 Random Forest

# Averaging many randomized trees reduces a single tree's variance.
from sklearn.ensemble import (
    RandomForestRegressor,
)

rfr = RandomForestRegressor(
    random_state=42, n_estimators=100
)
rfr.fit(bos_X_train, bos_y_train)
rfr.score(bos_X_test, bos_y_test)
# Top five features by impurity-based importance.
for col, val in sorted(
    zip(bos_X.columns, rfr.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
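Because each tree sees only a bootstrap sample, a random forest can estimate generalization from the left-out (out-of-bag) rows without touching the test set. A minimal sketch:

rfr_oob = RandomForestRegressor(
    random_state=42, n_estimators=100, oob_score=True
)
rfr_oob.fit(bos_X_train, bos_y_train)
rfr_oob.oob_score_  # R^2 estimated from out-of-bag samples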
14.7 XGBoost Regression

# Gradient boosting builds trees sequentially, each correcting
# the residual errors of the ensemble so far.
import xgboost as xgb

xgr = xgb.XGBRegressor(random_state=42)
xgr.fit(bos_X_train, bos_y_train)
xgr.score(bos_X_test, bos_y_test)
xgr.predict(bos_X.iloc[[0]])
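Boosting can overfit as rounds accumulate. A sketch of early stopping against a held-out set; passing early_stopping_rounds to the constructor needs xgboost >= 1.6 (older versions take it as a fit() argument), and reusing the test set as the eval set leaks it into training decisions, so a separate validation split is preferable:

xgr_es = xgb.XGBRegressor(
    random_state=42,
    n_estimators=1000,
    early_stopping_rounds=10,  # stop once the eval metric stalls for 10 rounds
)
xgr_es.fit(
    bos_X_train,
    bos_y_train,
    eval_set=[(bos_X_test, bos_y_test)],
    verbose=False,
)
xgr_es.best_iteration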
# Top five features by importance.
for col, val in sorted(
    zip(bos_X.columns, xgr.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")
# Built-in importance plot (default "weight": how often a
# feature is used to split).
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(xgr, ax=ax)

# The same model through Yellowbrick's visualizer.
fig, ax = plt.subplots(figsize=(10, 8))
fi_viz = FeatureImportances(xgr)
fi_viz.fit(bos_X_train, bos_y_train)
fi_viz.poof()

# Text dump of the first boosted tree, then a rendered plot of it.
booster = xgr.get_booster()
print(booster.get_dump()[0])

fig, ax = plt.subplots(figsize=(30, 20))
xgb.plot_tree(xgr, ax=ax, num_trees=0)
14.8 LightGBM Regression

# LightGBM grows trees leaf-wise rather than level-wise,
# which is typically faster on larger datasets.
import lightgbm as lgb

lgr = lgb.LGBMRegressor(random_state=42)
lgr.fit(bos_X_train, bos_y_train)
lgr.score(bos_X_test, bos_y_test)
lgr.predict(bos_X.iloc[[0]])

# Top five features by split count.
for col, val in sorted(
    zip(bos_X.columns, lgr.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")

fig, ax = plt.subplots(figsize=(10, 8))
lgb.plot_importance(lgr, ax=ax)
fig.tight_layout()
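A closing sketch of the LightGBM knobs that matter most on a small dataset like this one; the values are illustrative assumptions, not tuned results:

lgr2 = lgb.LGBMRegressor(
    random_state=42,
    num_leaves=31,         # main complexity control for leaf-wise growth
    n_estimators=200,
    learning_rate=0.05,    # smaller steps, more trees
    min_child_samples=10,  # guard against tiny, noisy leaves
)
lgr2.fit(bos_X_train, bos_y_train)
lgr2.score(bos_X_test, bos_y_test)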