6장 탐색
데이터를 탐색하는 방법을 다룹니다.
- 6.1 데이터의 크기
- 6.2 요약 통계
- 6.3 히스토그램
- 6.4 산점도
- 6.5 조인트 플롯
- 6.6 쌍 격자
- 6.7 박스 플롯과 바이올린 플롯
- 6.8 두 순서형 값의 비교
- 6.9 상관관계
- 6.10 라드비즈
- 6.11 평행 좌표
# 6.1 Size of the data: (number of rows, number of columns) of X.
X.shape
# 6.2 Summary statistics, narrowed to the first and last described columns.
X.describe().take([0, -1], axis=1)
# 6.3 Histogram of the fare column, drawn on a dedicated axes.
fig, ax = plt.subplots(figsize=(6, 4))
X["fare"].plot.hist(ax=ax)
import seaborn as sns
# Overlay the fare distributions of the two outcome groups on one axes.
fig, ax = plt.subplots(figsize=(12, 8))
mask = y_train == 1  # True for rows whose training label equals 1
ax = sns.distplot(X_train[mask]["fare"], label="survived")
ax = sns.distplot(X_train[~mask]["fare"], label="died")
# Restrict the visible x-range to [-1.5, 1.5].
ax.set_xlim(left=-1.5, right=1.5)
ax.legend()
# 6.4 Scatter plot of age against fare; points are drawn semi-transparent.
fig, ax = plt.subplots(figsize=(6, 4))
X.plot(kind="scatter", x="age", y="fare", alpha=0.3, ax=ax)
# Correlation coefficient between the two plotted columns.
X["age"].corr(X["fare"])
from yellowbrick.features import (
JointPlotVisualizer,
)
# 6.5 Joint plot with yellowbrick: "age" is passed as the feature and
# "fare" as the target of the visualizer.
fig, ax = plt.subplots(figsize=(6, 6))
jpv = JointPlotVisualizer(feature="age", target="fare")
jpv.fit(X["age"], X["fare"])
# NOTE(review): poof() appears to be the older yellowbrick name for the
# render call (later releases use show()) - confirm against the installed
# yellowbrick version.
jpv.poof()
from seaborn import jointplot
# 6.5 Joint plot with seaborn: age against fare with a regression fit
# (kind="reg"), drawn from a copy of X that also carries the target column.
fig, ax = plt.subplots(figsize=(6, 6))
new_df = X.copy()
new_df["target"] = y
# Pass x/y by keyword: seaborn >= 0.12 rejects them as positional arguments,
# while the keyword form is accepted by older releases as well.
p = jointplot(x="age", y="fare", data=new_df, kind="reg")
from seaborn import pairplot
# 6.6 Pair grid with seaborn: pairwise regression plots of three columns,
# colored by the target value.
fig, ax = plt.subplots(figsize=(6, 6))
new_df = X.copy()
new_df["target"] = y
# Named pair_columns rather than `vars` so the builtin vars() is not
# shadowed for the rest of the session.
pair_columns = ["pclass", "age", "fare"]
p = pairplot(new_df, vars=pair_columns, hue="target", kind="reg")
from seaborn import boxplot
# 6.7 Box plot of age, one box per target value.
fig, ax = plt.subplots(figsize=(8, 6))
# Work on a copy of X extended with the target column.
new_df = X.assign(target=y)
# ax is the axes created just above, i.e. the current axes.
boxplot(data=new_df, x="target", y="age", ax=ax)
from seaborn import violinplot
# 6.7 Violin plot of the sex_male column, one violin per target value.
fig, ax = plt.subplots(figsize=(8, 6))
# Work on a copy of X extended with the target column.
new_df = X.assign(target=y)
# ax is the axes created just above, i.e. the current axes.
violinplot(data=new_df, x="target", y="sex_male", ax=ax)
# 6.8 Compare two ordinal values: age split into 10 quantile bins against
# pclass split into 3 equal-width bins, drawn as stacked bars.
fig, ax = plt.subplots(figsize=(8, 6))
(
    X.assign(
        age_bin=pd.qcut(X["age"], q=10, labels=False),
        class_bin=pd.cut(X["pclass"], bins=3, labels=False),
    )
    .groupby(["age_bin", "class_bin"])
    .size()
    .unstack()
    # Divide every row (age bin) by its own total so the bars show shares.
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))
    .plot(kind="bar", stacked=True, width=1, ax=ax, cmap="viridis")
    .legend(bbox_to_anchor=(1, 1))
)
from yellowbrick.features import Rank2D
# 6.9 Correlation: yellowbrick Rank2D over all columns of X using the
# "pearson" algorithm.
fig, ax = plt.subplots(figsize=(6, 6))
pcv = Rank2D(features=X.columns, algorithm="pearson")
pcv.fit(X, y)
pcv.transform(X)
# NOTE(review): poof() appears to be the older yellowbrick name for the
# render call (later releases use show()) - confirm against the installed
# yellowbrick version.
pcv.poof()
from seaborn import heatmap
# 6.9 Annotated heat map of the pairwise correlations between columns of X.
# The color scale is pinned to [-1, 1] and cells are labelled to 2 decimals.
fig, ax = plt.subplots(figsize=(8, 8))
ax = heatmap(
    X.corr(),
    vmin=-1,
    vmax=1,
    cmap="RdBu_r",
    annot=True,
    fmt=".2f",
    ax=ax,
)
# Correlations of every column against the first two columns only.
X.corr().iloc[:, 0:2]
import numpy as np
def correlated_columns(df, threshold=0.95):
    """Return column pairs whose absolute correlation exceeds a threshold.

    Only the strictly lower triangle of ``df.corr()`` is inspected, so each
    pair is considered once and self-correlations are ignored.

    Args:
        df: DataFrame whose columns are correlated with ``df.corr()``.
        threshold: Exclusive lower bound on the absolute correlation for a
            pair to be reported.

    Returns:
        DataFrame with the columns ``level_0`` and ``level_1`` (the two
        column labels of a pair) and ``pearson`` (their correlation). Rows
        whose ``level_0`` label also occurs anywhere in ``level_1`` are
        dropped by the final query.
    """
    # NOTE(review): the level_0 / level_1 names rely on the column index
    # being unnamed - confirm for inputs with a named columns index.
    return (
        df.corr()
        .pipe(
            # Label the triangle with the correlation matrix's own axes:
            # corr() may return fewer columns than ``df`` has (non-numeric
            # columns can be left out), in which case ``df.columns`` would
            # not match the shape of the array.
            lambda corr: pd.DataFrame(
                np.tril(corr, k=-1),
                columns=corr.columns,
                index=corr.index,
            )
        )
        .stack()
        .rename("pearson")
        .pipe(lambda s: s[s.abs() > threshold].reset_index())
        .query("level_0 not in level_1")
    )
# List the column pairs of X whose absolute correlation exceeds the
# function's default threshold (0.95).
correlated_columns(X)
from yellowbrick.features import RadViz
# 6.10 RadViz with yellowbrick over all columns of X, with the two classes
# labelled "died" and "survived".
fig, ax = plt.subplots(figsize=(6, 6))
rv = RadViz(
classes=["died", "survived"],
features=X.columns,
)
rv.fit(X, y)
# The transform result is bound to _ (i.e. intentionally not used).
_ = rv.transform(X)
# NOTE(review): poof() appears to be the older yellowbrick name for the
# render call (later releases use show()) - confirm against the installed
# yellowbrick version.
rv.poof()
from pandas.plotting import radviz
# 6.10 RadViz with pandas: points are grouped by the "target" column and
# colored from the "PiYG" colormap.
fig, ax = plt.subplots(figsize=(6, 6))
# Work on a copy of X extended with the target column.
new_df = X.assign(target=y)
radviz(new_df, class_column="target", colormap="PiYG", ax=ax)
from yellowbrick.features import (
ParallelCoordinates,
)
# 6.11 Parallel coordinates with yellowbrick over all columns of X, with
# the two classes labelled "died" and "survived".
fig, ax = plt.subplots(figsize=(6, 4))
pc = ParallelCoordinates(
classes=["died", "survived"],
features=X.columns,
)
pc.fit(X, y)
pc.transform(X)
# Re-apply the axes' current x tick labels rotated by 45 degrees.
ax.set_xticklabels(
ax.get_xticklabels(), rotation=45
)
# NOTE(review): poof() appears to be the older yellowbrick name for the
# render call (later releases use show()) - confirm against the installed
# yellowbrick version.
pc.poof()
from pandas.plotting import (
parallel_coordinates,
)
# 6.11 Parallel coordinates with pandas: lines are grouped by the "target"
# column, colored from the "viridis" colormap and drawn half-transparent.
fig, ax = plt.subplots(figsize=(6, 4))
# Work on a copy of X extended with the target column.
new_df = X.assign(target=y)
parallel_coordinates(
    new_df, class_column="target", colormap="viridis", alpha=0.5, ax=ax
)
# Re-apply the axes' current x tick labels rotated by 45 degrees.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)