17장 차원성 감소
차원성 감소 기법을 알아봅니다.
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
StandardScaler,
)
# Fit PCA on standardized features. PCA is sensitive to feature scale, so the
# columns are standardized before the decomposition is fit.
pca = PCA(random_state=42)
std_scaler = StandardScaler()
X_pca = pca.fit_transform(std_scaler.fit_transform(X))

# Notebook-style inspection: share of variance captured by each component.
pca.explained_variance_ratio_
# Feature weights of the first principal component.
pca.components_[0]
# Scree plot: explained-variance ratio of each principal component, in order.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pca.explained_variance_ratio_)
ax.set_xlabel("Component")
ax.set_ylabel("Percent of Explained variance")
ax.set_title("Scree Plot")
# Ratios are fractions of the total variance, so pin the axis to [0, 1].
ax.set_ylim((0, 1))
# Running total of the explained-variance ratios: shows how many components
# are needed to reach a given share of the total variance.
fig, ax = plt.subplots(figsize=(8, 6))
cumulative_evr = pca.explained_variance_ratio_.cumsum()
ax.plot(cumulative_evr)
ax.set_xlabel("Component")
ax.set_ylabel("Percent of Explained variance")
ax.set_title("Cumulative Variance")
ax.set_ylim((0, 1))
# Heatmap of the PCA weights: one row per original feature, one column per
# principal component.
fig, ax = plt.subplots(figsize=(8, 8))
plt.imshow(
    pca.components_.T,  # transposed so features run down the y axis
    cmap="Spectral",
    vmin=-1,
    vmax=1,
)
plt.yticks(range(len(X.columns)), X.columns)
# Take the tick count from the fitted model instead of hard-coding 8, so the
# axis stays correctly labelled for any number of components.
n_components = pca.components_.shape[0]
plt.xticks(range(n_components), range(1, n_components + 1))
plt.xlabel("Principal Component")
plt.ylabel("Contribution")
plt.title("Contribution of Features to Components")
plt.colorbar()
# Grouped bar chart of the PCA weights: one group per component, one bar per
# original feature.
fig, ax = plt.subplots(figsize=(10, 8))
component_weights = pd.DataFrame(pca.components_, columns=X.columns)
bar_ax = component_weights.plot(kind="bar", ax=ax)
# Anchor the legend at the upper-right corner of the axes.
bar_ax.legend(bbox_to_anchor=(1, 1))
# Collect the features whose absolute weight exceeds `min_val` in any of the
# first `num_components` principal components.
comps = pd.DataFrame(pca.components_, columns=X.columns)
min_val = 0.5
num_components = 2
pca_cols = set()
for comp_idx in range(num_components):
    weights = comps.iloc[comp_idx]
    strong = weights[weights.abs() > min_val]
    pca_cols |= set(strong.index)
# Notebook-style display of the selected feature names.
pca_cols
from yellowbrick.features.pca import (
PCADecomposition,
)
# Yellowbrick PCA projection, with each sample colored by its label.
fig, ax = plt.subplots(figsize=(10, 8))
# Labels index into this string: 0 -> "r", 1 -> "g".
class_colors = "rg"
colors = [class_colors[label] for label in y]
pca_viz = PCADecomposition(color=colors)
pca_viz.fit_transform(X, y)
# NOTE(review): `poof` is the older Yellowbrick name for rendering the
# figure; confirm it is still available in the installed version.
pca_viz.poof()
import seaborn as sns
# Seaborn scatter of the first two principal components, colored by outcome.
fig, ax = plt.subplots(figsize=(10, 8))

# Name the projected columns PC1..PCk.
pc_names = [f"PC{n}" for n in range(1, X_pca.shape[1] + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_names)

# Map the numeric label to a readable status: 0 -> deceased, 1 -> survived.
status_names = ("deceased", "survived")
pca_df["status"] = [status_names[label] for label in y]

# Scale the axes by the ratio of explained variance of the two plotted
# components.
evr = pca.explained_variance_ratio_
ax.set_aspect(evr[1] / evr[0])

sns.scatterplot(
    x="PC1", y="PC2", hue="status", data=pca_df, alpha=0.5, ax=ax
)
# Biplot: scatter of two principal components with one arrow per original
# feature showing that feature's weight on the two plotted components.
fig, ax = plt.subplots(figsize=(10, 8))

pc_names = [f"PC{n}" for n in range(1, X_pca.shape[1] + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_names)
status_names = ("deceased", "survived")
pca_df["status"] = [status_names[label] for label in y]

evr = pca.explained_variance_ratio_
x_idx = 0  # x_pc
y_idx = 1  # y_pc
ax.set_aspect(evr[y_idx] / evr[x_idx])

x_col = pca_df.columns[x_idx]
y_col = pca_df.columns[y_idx]
sns.scatterplot(
    x=x_col, y=y_col, hue="status", data=pca_df, alpha=0.5, ax=ax
)

# Weights are multiplied by `scale` so the arrows are visible at the scale of
# the scatter.
scale = 8
comps = pd.DataFrame(pca.components_, columns=X.columns)
# After transposing, each row is one feature, indexed by component number.
for feature, weights in comps.T.iterrows():
    tip_x = weights[x_idx] * scale
    tip_y = weights[y_idx] * scale
    plt.arrow(0, 0, tip_x, tip_y, color="k")
    plt.text(tip_x, tip_y, feature, weight="bold")
# Seaborn scatter of the first against the fourth principal component.
fig, ax = plt.subplots(figsize=(10, 8))

pc_names = [f"PC{n}" for n in range(1, X_pca.shape[1] + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_names)

# Map the numeric label to a readable status: 0 -> deceased, 1 -> survived.
status_names = ("deceased", "survived")
pca_df["status"] = [status_names[label] for label in y]

# Aspect ratio follows the explained variance of PC4 (index 3) relative to
# PC1 (index 0).
evr = pca.explained_variance_ratio_
ax.set_aspect(evr[3] / evr[0])

sns.scatterplot(
    x="PC1", y="PC4", hue="status", data=pca_df, alpha=0.5, ax=ax
)
from bokeh.io import output_notebook
from bokeh import models, palettes, transform
from bokeh.plotting import figure, show
def bokeh_scatter(x, y, data,
                  hue=None, label_cols=None,
                  size=None, legend=None, alpha=0.5,):
    """Show an interactive Bokeh scatter plot in the notebook and return it.

    Args:
        x: Column name in ``data`` used for the x coordinate.
        y: Column name in ``data`` used for the y coordinate.
        data: Table wrapped in a ``ColumnDataSource``. It must expose
            ``.columns`` when ``label_cols`` is omitted (presumably a pandas
            DataFrame -- confirm against callers).
        hue: Optional column name. Its values are mapped linearly, from their
            minimum to their maximum, onto the 256-color viridis palette and
            used as the marker fill color.
        label_cols: Columns listed in the hover tooltip, sorted by name.
            Defaults to every column of ``data``.
        size: Optional marker size; forwarded only when truthy.
        legend: Optional legend argument; forwarded only when truthy.
        alpha: Marker opacity.

    Returns:
        The Bokeh figure, after it has been displayed with ``show``.
    """
    output_notebook()

    # Optional glyph arguments are only passed on when the caller set them.
    glyph_opts = {}
    if legend:
        glyph_opts["legend"] = legend
    if size:
        glyph_opts["size"] = size
    if hue:
        hue_values = data[hue]
        color_mapper = models.LinearColorMapper(
            palette=palettes.viridis(256),
            low=min(hue_values),
            high=max(hue_values),
        )
        glyph_opts["fill_color"] = transform.transform(hue, color_mapper)

    source = models.ColumnDataSource(data)

    if label_cols is None:
        label_cols = data.columns
    # One (label, "@column") pair per column, ordered by column name.
    tooltips = [
        (col, "@{}".format(col)) for col in sorted(label_cols)
    ]
    hover = models.HoverTool(tooltips=tooltips)

    plot = figure(
        tools=[hover, "pan", "zoom_in", "zoom_out", "reset"],
        toolbar_location="below",
    )
    plot.circle(x, y, source=source, alpha=alpha, **glyph_opts)
    show(plot)
    return plot
# Interactive version of the PC1/PC2 scatter, colored by the label.
# The label's index is reset so it lines up with pca_df's default row index.
surv_df = pca_df.assign(surv=y.reset_index(drop=True))
res = bokeh_scatter(
    "PC1", "PC2", data=surv_df, hue="surv", size=10, legend="surv"
)
plt.show()
from yellowbrick.features.pca import (
PCADecomposition,
)
# Yellowbrick PCA projection onto three components, colored by label.
# Labels index into this string: 0 -> "r", 1 -> "g".
class_colors = "rg"
colors = [class_colors[label] for label in y]
pca3_viz = PCADecomposition(proj_dim=3, color=colors)
pca3_viz.fit_transform(X, y)
pca3_viz.finalize()
# Grab the current figure and tighten its layout.
fig = plt.gcf()
plt.tight_layout()
from mpl_toolkits.mplot3d import Axes3D
# Plain matplotlib 3-D scatter of the first three principal components.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection="3d")
pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
# Color each point by its label through the viridis colormap.
ax.scatter(xs=pc1, ys=pc2, zs=pc3, c=y, cmap="viridis")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
import umap
# Embed the standardized features with UMAP, using a fixed seed.
u = umap.UMAP(random_state=42)
umap_input = StandardScaler().fit_transform(X)
X_umap = u.fit_transform(umap_input)
# Notebook-style display of the embedding's shape.
X_umap.shape
# Scatter of the two UMAP dimensions, colored by label.
fig, ax = plt.subplots(figsize=(10, 10))
umap_df = pd.DataFrame(X_umap)
# Columns are the default integer labels 0 and 1.
umap_df.plot.scatter(
    x=0, y=1, ax=ax, c=y, alpha=0.2, cmap="Spectral"
)
# Compare UMAP embeddings across several n_neighbors values on a 2x2 grid.
X_std = StandardScaler().fit_transform(X)
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.ravel()  # flatten the 2x2 grid so it can be paired with values
for ax, n in zip(axes, [2, 5, 10, 50]):
    u = umap.UMAP(random_state=42, n_neighbors=n)
    X_umap = u.fit_transform(X_std)
    pd.DataFrame(X_umap).plot.scatter(
        x=0, y=1, ax=ax, c=y, cmap="Spectral", alpha=0.5
    )
    ax.set_title(f"nn={n}")
plt.tight_layout()
# Compare UMAP embeddings across several min_dist values on a 2x2 grid.
# X_std is the standardized feature matrix computed earlier in the file.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.ravel()  # flatten the 2x2 grid so it can be paired with values
for ax, min_dist in zip(axes, [0, 0.33, 0.66, 0.99]):
    u = umap.UMAP(random_state=42, min_dist=min_dist)
    X_umap = u.fit_transform(X_std)
    pd.DataFrame(X_umap).plot.scatter(
        x=0, y=1, ax=ax, c=y, cmap="Spectral", alpha=0.5
    )
    ax.set_title(f"min_dist={min_dist}")
plt.tight_layout()
from sklearn.manifold import TSNE
# Embed the standardized features in two dimensions with t-SNE and plot the
# result, colored by label.
X_std = StandardScaler().fit_transform(X)
# Pin the seed: t-SNE's optimization is stochastic, so without it the
# embedding (and the figure) changes on every run. 42 matches the seed used
# by the other reducers in this file.
ts = TSNE(random_state=42)
X_tsne = ts.fit_transform(X_std)

fig, ax = plt.subplots(figsize=(10, 10))
# Labels index into the string: 0 -> "r", 1 -> "g".
colors = ["rg"[j] for j in y]
scat = ax.scatter(
    X_tsne[:, 0],
    X_tsne[:, 1],
    c=colors,
    alpha=0.5,
)
ax.set_xlabel("Embedding 1")
ax.set_ylabel("Embedding 2")
# Compare t-SNE embeddings across several perplexity values on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.reshape(4)  # flatten the 2x2 grid so it can be indexed by i
for i, n in enumerate((2, 30, 50, 100)):
    ax = axes[i]
    t = TSNE(random_state=42, perplexity=n)
    # Fit on the standardized matrix (X_std, computed earlier in the file),
    # not raw X: t-SNE works on pairwise distances, so unscaled columns with
    # large ranges would dominate the embedding. This also keeps the sweep
    # comparable with the single t-SNE plot and the UMAP sweeps.
    X_tsne = t.fit_transform(X_std)
    pd.DataFrame(X_tsne).plot(
        kind="scatter",
        x=0,
        y=1,
        ax=ax,
        c=y,
        cmap="Spectral",
        alpha=0.5,
    )
    ax.set_title(f"perplexity={n}")
plt.tight_layout()