6.1 데이터의 크기

# Dimensions of the feature matrix X as a (rows, columns) tuple.
X.shape

(1309, 8)

6.2 요약 통계

# Summary statistics of X, keeping only the first and the last column
# of the describe() table so the output stays narrow.
X.describe().iloc[:, [0, -1]]

6.3 히스토그램

# Histogram of the fare column, drawn on a dedicated 6x4 figure.
fig, ax = plt.subplots(figsize=(6, 4))
X["fare"].plot.hist(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7fd70c5eae10>

import seaborn as sns

# Overlay the fare distribution of the rows where y_train == 1 (labelled
# 'survived') on top of the remaining rows (labelled 'died').
fig, ax = plt.subplots(figsize=(12, 8))
mask = y_train == 1
ax = sns.distplot(X_train.loc[mask, "fare"], label='survived')
ax = sns.distplot(X_train.loc[~mask, "fare"], label='died')
# Restrict the x-axis to the [-1.5, 1.5] window before adding the legend.
ax.set_xlim(-1.5, 1.5)
ax.legend()

<matplotlib.legend.Legend at 0x7fd70c5eaa10>

6.4 산점도

# Scatter plot of age (x) against fare (y); alpha=0.3 keeps overlapping
# points distinguishable.
fig, ax = plt.subplots(figsize=(6, 4))
X.plot(kind="scatter", x="age", y="fare", alpha=0.3, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7fd6face1d10>

# Correlation coefficient between the age and fare columns
# (Series.corr with its default method).
X.age.corr(X.fare)

0.1771997483998958

6.5 조인트 플롯

from yellowbrick.features import (
  JointPlotVisualizer,
)

# Yellowbrick joint plot with age as the feature and fare as the target.
fig, ax = plt.subplots(figsize=(6, 6))
jpv = JointPlotVisualizer(feature="age", target="fare")
# The visualizer is fitted on the two columns directly rather than on all of X.
jpv.fit(X["age"], X["fare"])
# NOTE(review): poof() presumably finalizes and displays the figure —
# confirm against the installed yellowbrick version.
jpv.poof()

from seaborn import jointplot
# seaborn joint plot of age against fare with a regression fit (kind="reg").
# NOTE(review): jointplot is a figure-level function, so the axes created
# on the next line are presumably left empty — confirm before removing it.
fig, ax = plt.subplots(figsize=(6, 6))
new_df = X.copy()
new_df["target"] = y
# x and y are passed by keyword: newer seaborn releases reject positional
# x/y, while older releases accept the keyword form as well.
p = jointplot(x="age", y="fare", data=new_df, kind="reg")

6.6 쌍 격자

from seaborn import pairplot
# Pairwise plots of pclass, age and fare, colored by the target column,
# with a regression fit in each off-diagonal panel (kind="reg").
fig, ax = plt.subplots(figsize=(6, 6))
new_df = X.copy()
new_df["target"] = y
# The column list is passed inline instead of being bound to a module-level
# name `vars`, which would shadow the builtin vars() in every later cell.
p = pairplot(
  new_df,
  vars=["pclass", "age", "fare"],
  hue="target",
  kind="reg",
)

6.7 박스 플롯과 바이올린 플롯

from seaborn import boxplot

# Box plot of age, one box per value of the target column.
fig, ax = plt.subplots(figsize=(8, 6))
# assign() works on a copy, so X itself is left without a target column.
new_df = X.assign(target=y)
boxplot(data=new_df, x="target", y="age")

<matplotlib.axes._subplots.AxesSubplot at 0x7fd6fbbe6810>

from seaborn import violinplot

# Violin plot of the sex_male column, one violin per value of the target.
fig, ax = plt.subplots(figsize=(8, 6))
# assign() works on a copy, so X itself is left without a target column.
new_df = X.assign(target=y)
violinplot(data=new_df, x="target", y="sex_male")

<matplotlib.axes._subplots.AxesSubplot at 0x7fd6fbb7d910>

6.8 두 순서형 값의 비교

# Compare two ordinal values: age split into 10 quantile bins and pclass
# split into 3 equal-width bins, shown as a stacked bar per age bin.
fig, ax = plt.subplots(figsize=(8, 6))
(
  X.assign(
    age_bin=pd.qcut(X.age, q=10, labels=False),
    class_bin=pd.cut(X.pclass, bins=3, labels=False),
  )
  # Count rows per (age_bin, class_bin) and spread class_bin into columns.
  .groupby(["age_bin", "class_bin"])
  .size()
  .unstack()
  # Normalize every age bin's row so its class shares sum to 1.
  .pipe(
    lambda counts: counts.div(counts.sum(axis=1), axis="index")
  )
  .plot(kind="bar", stacked=True, width=1, ax=ax, cmap="viridis")
  # Anchor the legend at the axes' (1, 1) corner.
  .legend(bbox_to_anchor=(1, 1))
)

<matplotlib.legend.Legend at 0x7fd6faa4c610>

6.9 상관관계

from yellowbrick.features import Rank2D
# Yellowbrick Rank2D over all columns of X using the "pearson" algorithm.
fig, ax = plt.subplots(figsize=(6, 6))
pcv = Rank2D(features=X.columns, algorithm="pearson")
# fit -> transform -> poof is the call sequence used for every yellowbrick
# visualizer in this file.
pcv.fit(X, y)
pcv.transform(X)
pcv.poof()

from seaborn import heatmap

# Annotated heat map of the correlation matrix of X. The color scale is
# pinned to [-1, 1] and each cell shows its value with two decimals.
fig, ax = plt.subplots(figsize=(8, 8))
ax = heatmap(
  X.corr(),
  annot=True,
  fmt=".2f",
  cmap="RdBu_r",
  vmin=-1,
  vmax=1,
  ax=ax,
)

# Correlation matrix of X, showing only its first two columns.
X.corr().iloc[:, :2]

import numpy as np

def correlated_columns(df, threshold=0.95):
  """Return pairs of columns of *df* whose correlation is above *threshold*.

  Only numeric and boolean columns take part; other columns are ignored.
  Each pair is reported once, taken from the strictly lower triangle of the
  correlation matrix.

  Args:
    df: DataFrame whose columns are compared.
    threshold: Minimum absolute correlation (exclusive) for a pair to be
      reported.

  Returns:
    DataFrame with columns ``level_0`` and ``level_1`` (the two column
    labels) and ``pearson`` (their correlation). Rows whose ``level_0``
    label also occurs anywhere in ``level_1`` are dropped. The row index is
    not renumbered after that final filter.
  """
  # Non-numeric columns cannot be correlated; selecting first keeps the
  # matrix and its labels in agreement instead of failing on mixed frames.
  corr = df.select_dtypes(include=["number", "bool"]).corr()
  # Zero out the diagonal and upper triangle so each pair appears once.
  # Labels come from the correlation matrix itself, not from df.
  lower = pd.DataFrame(
    np.tril(corr, k=-1), columns=corr.columns, index=corr.index
  )
  pairs = (
    lower.stack()
    # Pin the level names the query below relies on, even when the
    # column index of df carries its own name.
    .rename_axis(["level_0", "level_1"])
    .rename("pearson")
  )
  return (
    pairs[pairs.abs() > threshold]
    .reset_index()
    .query("level_0 not in level_1")
  )

# Show the column pairs of X that exceed the function's default threshold.
correlated_columns(X)

6.10 라드비즈

from yellowbrick.features import RadViz

# Yellowbrick RadViz over all columns of X, with the two classes labelled
# "died" and "survived".
fig, ax = plt.subplots(figsize=(6, 6))
rv = RadViz(
  classes=["died", "survived"],
  features=X.columns,
)

rv.fit(X, y)
# The transform result is bound to _ so the notebook does not echo it.
_ = rv.transform(X)
rv.poof()

from pandas.plotting import radviz

# pandas RadViz plot of every column of X, colored by the target column.
fig, ax = plt.subplots(figsize=(6, 6))
# assign() works on a copy, so X itself is left without a target column.
new_df = X.assign(target=y)
radviz(frame=new_df, class_column="target", ax=ax, colormap="PiYG")

<matplotlib.axes._subplots.AxesSubplot at 0x7fd6fa9b63d0>

6.11 평행 좌표

from yellowbrick.features import (
  ParallelCoordinates,
)

# Yellowbrick parallel-coordinates plot over all columns of X, with the
# two classes labelled "died" and "survived".
fig, ax = plt.subplots(figsize=(6, 4))

pc = ParallelCoordinates(
  classes=["died", "survived"],
  features=X.columns,
)
pc.fit(X, y)
pc.transform(X)

# Rotate the tick labels by 45 degrees.
# NOTE(review): the visualizer is not given ax explicitly; this presumably
# works because it draws on the current axes — confirm.
ax.set_xticklabels(
  ax.get_xticklabels(), rotation=45
)

pc.poof()

from pandas.plotting import (
  parallel_coordinates,
)

# pandas parallel-coordinates plot of every column of X, one line per row,
# colored by the target column.
fig, ax = plt.subplots(figsize=(6, 4))

# assign() works on a copy, so X itself is left without a target column.
new_df = X.assign(target=y)
parallel_coordinates(
  frame=new_df,
  class_column="target",
  colormap="viridis",
  alpha=0.5,
  ax=ax,
)

# Re-apply the existing tick labels rotated by 45 degrees.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

[Text(0,0,'pclass'),
 Text(0,0,'age'),
 Text(0,0,'sibsp'),
 Text(0,0,'parch'),
 Text(0,0,'fare'),
 Text(0,0,'sex_male'),
 Text(0,0,'embarked_Q'),
 Text(0,0,'embarked_S')]

	pclass	embarked_S
count	1309.000000	1309.000000
mean	-0.012831	0.698243
std	0.995822	0.459196
min	-1.551881	0.000000
25%	-0.363317	0.000000
50%	0.825248	1.000000
75%	0.825248	1.000000
max	0.825248	1.000000

	pclass	age
pclass	1.000000	-0.439704
age	-0.439704	1.000000
sibsp	0.060832	-0.292056
parch	0.018322	-0.176447
fare	-0.558827	0.177200
sex_male	0.124617	0.065004
embarked_Q	0.230491	-0.053904
embarked_S	0.096335	-0.045361