6.1 데이터의 크기

# Dimensions of X as a (rows, columns) tuple.
X.shape
(1309, 8)

6.2 요약 통계

# Summary statistics for X, narrowed to the first and the last column only.
X.describe().iloc[:, [0, -1]]
pclass embarked_S
count 1309.000000 1309.000000
mean -0.012831 0.698243
std 0.995822 0.459196
min -1.551881 0.000000
25% -0.363317 0.000000
50% 0.825248 1.000000
75% 0.825248 1.000000
max 0.825248 1.000000

6.3 히스토그램

# Histogram of the fare column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(6, 4))
X["fare"].plot.hist(ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd70c5eae10>
import seaborn as sns

# Overlay the fare distribution of the two target classes in the training split.
fig, ax = plt.subplots(figsize=(12, 8))
# Boolean selector for rows whose training label equals 1 (plotted as 'survived').
mask = y_train == 1
# NOTE(review): distplot is deprecated in recent seaborn releases; kept as-is
# because a replacement would change the rendered output.
# ax is rebound to whatever axes each distplot call returns.
ax = sns.distplot(X_train[mask].fare, label='survived')
ax = sns.distplot(X_train[~mask].fare, label='died')
# Restrict the visible x-range to [-1.5, 1.5].
ax.set_xlim(-1.5, 1.5)
ax.legend()
<matplotlib.legend.Legend at 0x7fd70c5eaa10>

6.4 산점도

# Scatter plot of age against fare; points are drawn semi-transparent.
fig, ax = plt.subplots(figsize=(6, 4))
X.plot(kind="scatter", x="age", y="fare", alpha=0.3, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd6face1d10>
# Correlation coefficient between the age and fare columns.
X["age"].corr(X["fare"])
0.1771997483998958

6.5 조인트 플롯

from yellowbrick.features import (
  JointPlotVisualizer,
)

# Yellowbrick joint plot of age against fare.
# The visualizer is not given `ax`; presumably it draws on the current
# matplotlib axes created here — TODO confirm for the installed version.
fig, ax = plt.subplots(figsize=(6, 6))
jpv = JointPlotVisualizer(feature="age", target="fare")
# Fit on the two columns passed as separate series.
jpv.fit(X["age"], X["fare"])
# Render the figure.
jpv.poof()
from seaborn import jointplot
# Seaborn joint plot of age against fare with a regression fit.
# NOTE(review): jointplot appears to build its own figure, so the figure
# created on the next line is likely left blank — confirm before removing it.
fig, ax = plt.subplots(figsize=(6, 6))
new_df = X.copy()
new_df["target"] = y
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
p = jointplot(x="age", y="fare", data=new_df, kind="reg")

6.6 쌍 격자

from seaborn import pairplot
# Pairwise regression plots of three features, coloured by the target column.
# NOTE(review): pairplot appears to build its own figure, so the figure
# created on the next line is likely left blank — confirm before removing it.
fig, ax = plt.subplots(figsize=(6, 6))
new_df = X.copy()
new_df["target"] = y
# Named pair_vars rather than `vars` so the builtin vars() is not shadowed
# for the rest of the session.
pair_vars = ["pclass", "age", "fare"]
p = pairplot(new_df, vars=pair_vars, hue="target", kind="reg")

6.7 박스 플롯과 바이올린 플롯

from seaborn import boxplot

# Box plot of age for each value of the target column.
fig, ax = plt.subplots(figsize=(8, 6))
# Copy of X with the labels attached as an extra "target" column.
new_df = X.assign(target=y)
boxplot(data=new_df, x="target", y="age")
<matplotlib.axes._subplots.AxesSubplot at 0x7fd6fbbe6810>
from seaborn import violinplot

# Violin plot of the sex_male column for each value of the target column.
fig, ax = plt.subplots(figsize=(8, 6))
# Copy of X with the labels attached as an extra "target" column.
new_df = X.assign(target=y)
violinplot(data=new_df, x="target", y="sex_male")
<matplotlib.axes._subplots.AxesSubplot at 0x7fd6fbb7d910>

6.8 두 순서형 값의 비교

# Stacked bar chart: for each age bin, the share of rows in each class bin.
fig, ax = plt.subplots(figsize=(8, 6))
(
  X.assign(
    # Ten quantile-based age bins, returned as integer bin codes.
    age_bin=pd.qcut(X.age, q=10, labels=False),
    # Three equal-width pclass bins, returned as integer bin codes.
    class_bin=pd.cut(X.pclass, bins=3, labels=False),
  )
  # Count the rows in every (age_bin, class_bin) combination.
  .groupby(["age_bin", "class_bin"])
  .size()
  # Move class_bin to the columns: one row per age bin.
  .unstack()
  # Divide each row by its own total so the bars show proportions.
  .pipe(lambda df: df.div(df.sum(1), axis=0))
  .plot.bar(stacked=True, width=1, ax=ax, cmap="viridis")
  # Anchor the legend at axes coordinates (1, 1).
  .legend(bbox_to_anchor=(1, 1))
)
<matplotlib.legend.Legend at 0x7fd6faa4c610>

6.9 상관관계

from yellowbrick.features import Rank2D
# Yellowbrick Rank2D view of the pairwise Pearson ranking of all columns of X.
# The visualizer is not given `ax`; presumably it draws on the current
# matplotlib axes created here — TODO confirm for the installed version.
fig, ax = plt.subplots(figsize=(6, 6))
pcv = Rank2D(features=X.columns, algorithm="pearson")
pcv.fit(X, y)
pcv.transform(X)
# Render the figure.
pcv.poof()
from seaborn import heatmap

# Annotated heatmap of the correlation matrix of X. The colour scale is pinned
# to [-1, 1] and every cell is labelled with two decimals.
fig, ax = plt.subplots(figsize=(8, 8))
ax = heatmap(
  data=X.corr(),
  vmin=-1,
  vmax=1,
  cmap="RdBu_r",
  annot=True,
  fmt=".2f",
  ax=ax,
)
# Correlation matrix of X, showing only its first two columns.
X.corr().iloc[:, :2]
pclass age
pclass 1.000000 -0.439704
age -0.439704 1.000000
sibsp 0.060832 -0.292056
parch 0.018322 -0.176447
fare -0.558827 0.177200
sex_male 0.124617 0.065004
embarked_Q 0.230491 -0.053904
embarked_S 0.096335 -0.045361
import numpy as np

def correlated_columns(df, threshold=0.95):
  """Return column pairs whose absolute Pearson correlation exceeds a cutoff.

  Args:
    df: DataFrame to inspect. Only numeric and boolean columns take part in
      the correlation; columns of any other dtype are skipped.
    threshold: Cutoff compared against the absolute correlation with a
      strict ``>``.

  Returns:
    DataFrame with one row per strongly correlated pair and a ``pearson``
    column holding the coefficient. The two label columns are named
    ``level_0`` and ``level_1`` (the names ``reset_index`` gives unnamed
    index levels). Rows whose ``level_0`` label also occurs anywhere in
    ``level_1`` are removed.
  """
  # Select the usable columns up front so a text/categorical column neither
  # makes corr() fail nor leaves the labels out of step with the matrix.
  numeric = df.select_dtypes(include=["number", "bool"])
  corr = numeric.corr()
  # Keep only the strict lower triangle: each pair appears once and the
  # diagonal is zeroed out. Labels are taken from the correlation matrix
  # itself so they always match its shape.
  lower = pd.DataFrame(
    np.tril(corr, k=-1), index=corr.index, columns=corr.columns
  )
  pairs = lower.stack().rename("pearson")
  strong = pairs[pairs.abs() > threshold].reset_index()
  return strong.query("level_0 not in level_1")

# Run with the default threshold (0.95); the recorded output is an empty table.
correlated_columns(X)
level_0 level_1 pearson

6.10 라드비즈

from yellowbrick.features import RadViz

# Yellowbrick RadViz of all columns of X, with the two classes labelled
# "died" and "survived".
# The visualizer is not given `ax`; presumably it draws on the current
# matplotlib axes created here — TODO confirm for the installed version.
fig, ax = plt.subplots(figsize=(6, 6))
rv = RadViz(
  classes=["died", "survived"],
  features=X.columns,
)

rv.fit(X, y)
# The transform result is bound to `_` so the notebook does not echo it.
_ = rv.transform(X)
# Render the figure.
rv.poof()
from pandas.plotting import radviz

# pandas RadViz of every column of X, grouped by the target column.
fig, ax = plt.subplots(figsize=(6, 6))
# Copy of X with the labels attached as an extra "target" column.
new_df = X.assign(target=y)
radviz(frame=new_df, class_column="target", colormap="PiYG", ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd6fa9b63d0>

6.11 평행 좌표

from yellowbrick.features import (
  ParallelCoordinates,
)

# Yellowbrick parallel-coordinates plot of all columns of X, with the two
# classes labelled "died" and "survived".
# The visualizer is not given `ax`; presumably it draws on the current
# matplotlib axes created here — TODO confirm for the installed version.
fig, ax = plt.subplots(figsize=(6, 4))

pc = ParallelCoordinates(
  classes=["died", "survived"],
  features=X.columns,
)
pc.fit(X, y)
pc.transform(X)

# Re-apply the current x tick labels with a 45-degree rotation.
# NOTE(review): this runs before poof(); verify the rotation survives rendering.
ax.set_xticklabels(
  ax.get_xticklabels(), rotation=45
)

# Render the figure.
pc.poof()
from pandas.plotting import (
  parallel_coordinates,
)

# pandas parallel-coordinates plot of every column of X, one line per row,
# coloured by the target column.
fig, ax = plt.subplots(figsize=(6, 4))

# Copy of X with the labels attached as an extra "target" column.
new_df = X.assign(target=y)
parallel_coordinates(
  frame=new_df,
  class_column="target",
  colormap="viridis",
  alpha=0.5,
  ax=ax,
)

# Re-apply the x tick labels with a 45-degree rotation.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
[Text(0,0,'pclass'),
 Text(0,0,'age'),
 Text(0,0,'sibsp'),
 Text(0,0,'parch'),
 Text(0,0,'fare'),
 Text(0,0,'sex_male'),
 Text(0,0,'embarked_Q'),
 Text(0,0,'embarked_S')]