17.1 PCA

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features first; PCA is sensitive to scale.
pca = PCA(random_state=42)
X_pca = pca.fit_transform(
  StandardScaler().fit_transform(X)
)

pca.explained_variance_ratio_
array([0.23922833, 0.21616853, 0.1923158 , 0.10464906, 0.08154797,
       0.0727221 , 0.05130716, 0.04206107])
pca.components_[0]
array([-0.63274156,  0.39602149,  0.00653646,  0.11500362,  0.5815031 ,
       -0.19764926, -0.20422289, -0.10304598])
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(pca.explained_variance_ratio_)
ax.set(
  xlabel="Component",
  ylabel="Percent of Explained variance",
  title="Scree Plot",
  ylim=(0, 1),
)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(
  np.cumsum(pca.explained_variance_ratio_)
)

ax.set(
  xlabel="Component",
  ylabel="Percent of Explained variance",
  title="Cumulative Variance",
  ylim=(0, 1),
)
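Rather than eyeballing the cumulative curve, scikit-learn can pick the component count for you: passing a float between 0 and 1 as n_components keeps just enough components to reach that fraction of variance. A minimal sketch (the 0.9 target is an arbitrary choice for illustration):

# Keep the fewest components explaining at least 90% of the variance.
pca90 = PCA(n_components=0.9, random_state=42)
X_pca90 = pca90.fit_transform(
  StandardScaler().fit_transform(X)
)
pca90.n_components_  # how many components were kept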
fig, ax = plt.subplots(figsize=(8, 8))

plt.imshow(
  pca.components_.T,
  cmap="Spectral",
  vmin=-1,
  vmax=1,
)

plt.yticks(range(len(X.columns)), X.columns)
plt.xticks(range(8), range(1, 9))
plt.xlabel("Principal Component")
plt.ylabel("Contribution")
plt.title("Contribution of Features to Components")
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x7ff9f9d3ac90>
fig, ax = plt.subplots(figsize=(10, 8))

pd.DataFrame(
  pca.components_, columns=X.columns
).plot(kind="bar", ax=ax).legend(
  bbox_to_anchor=(1, 1)
)
<matplotlib.legend.Legend at 0x7ff9efad6e90>
comps = pd.DataFrame(pca.components_, columns=X.columns)

min_val = 0.5
num_components = 2
pca_cols = set()

for i in range(num_components):
  parts = comps.iloc[i][comps.iloc[i].abs() > min_val]
  pca_cols.update(set(parts.index))

pca_cols
{'fare', 'parch', 'pclass', 'sibsp'}
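The same selection can be written without the explicit loop, using a boolean mask over the first num_components rows of the loadings. A small alternative sketch:

# Columns whose absolute loading exceeds min_val in any of the
# first num_components components.
mask = (
  comps.iloc[:num_components].abs() > min_val
).any()
set(mask.index[mask])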
from yellowbrick.features.pca import (
  PCADecomposition,
)

fig, ax = plt.subplots(figsize=(10, 8))

colors = ["rg"[j] for j in y]
pca_viz = PCADecomposition(color=colors)
pca_viz.fit_transform(X, y)
pca_viz.poof()
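Newer Yellowbrick releases (1.0 and later) rename this visualizer and replace .poof() with .show(). A hedged equivalent, in case the import above fails on your version:

# On yellowbrick >= 1.0 the visualizer is yellowbrick.features.PCA
# (aliased here to avoid clashing with sklearn's PCA).
from yellowbrick.features import PCA as YBPCA

fig, ax = plt.subplots(figsize=(10, 8))
pca_viz = YBPCA(ax=ax)
pca_viz.fit_transform(X, y)
pca_viz.show()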
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 8))

pca_df = pd.DataFrame(
  X_pca,
  columns=[
    f"PC{i+1}"
    for i in range(X_pca.shape[1])
  ]
)

pca_df["status"] = [
  ("deceased", "survived")[i] for i in y
]

evr = pca.explained_variance_ratio_
ax.set_aspect(evr[1] / evr[0])
sns.scatterplot(
  x="PC1",
  y="PC2",
  hue="status",
  data=pca_df,
  alpha=0.5,
  ax=ax,
)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff9ed051810>
fig, ax = plt.subplots(figsize=(10, 8))
pca_df = pd.DataFrame(
  X_pca,
  columns=[
    f"PC{i+1}"
    for i in range(X_pca.shape[1])
  ],
)

pca_df["status"] = [
  ("deceased", "survived")[i] for i in y
]

evr = pca.explained_variance_ratio_
x_idx = 0 # x_pc
y_idx = 1 # y_pc

ax.set_aspect(evr[y_idx] / evr[x_idx])
x_col = pca_df.columns[x_idx]
y_col = pca_df.columns[y_idx]
sns.scatterplot(
  x=x_col,
  y=y_col,
  hue="status",
  data=pca_df,
  alpha=0.5,
  ax=ax,
)

scale = 8

comps = pd.DataFrame(
  pca.components_, columns=X.columns
)

for idx, s in comps.T.iterrows():
  plt.arrow(
    0,
    0,
    s[x_idx] * scale,
    s[y_idx] * scale,
    color="k",
  )
  plt.text(
    s[x_idx] * scale,
    s[y_idx] * scale,
    idx,
    weight="bold",
  )
fig, ax = plt.subplots(figsize=(10, 8))
pca_df = pd.DataFrame(
  X_pca,
  columns=[
    f"PC{i+1}"
    for i in range(X_pca.shape[1])
  ],
)

pca_df["status"] = [
  ("deceased", "survived")[i] for i in y
]

evr = pca.explained_variance_ratio_
ax.set_aspect(evr[3] / evr[0])

sns.scatterplot(
  x="PC1",
  y="PC4",
  hue="status",
  data=pca_df,
  alpha=0.5,
  ax=ax,
)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff9ecf19710>
from bokeh.io import output_notebook
from bokeh import models, palettes, transform
from bokeh.plotting import figure, show

def bokeh_scatter(x, y, data,
                  hue=None, label_cols=None,
                  size=None, legend=None, alpha=0.5,):
  output_notebook()
  circle_kwargs = {}

  if legend:
    # Bokeh deprecated the bare "legend" keyword; legend_field
    # groups glyphs by the values of the named column.
    circle_kwargs["legend_field"] = legend
  if size:
    circle_kwargs["size"] = size
  if hue:
    color_seq = data[hue]
    mapper = models.LinearColorMapper(
      palette=palettes.viridis(256),
      low=min(color_seq),
      high=max(color_seq),
    )
    circle_kwargs[
      "fill_color"
    ] = transform.transform(hue, mapper)

  ds = models.ColumnDataSource(data)
  if label_cols is None:
    label_cols = data.columns
  
  tool_tips = sorted(
    [
      (x, "@{}".format(x))
      for x in label_cols
    ],
    key=lambda tup: tup[0],
  )

  hover = models.HoverTool(
    tooltips=tool_tips
  )

  fig = figure(
    tools=[
      hover,
      "pan",
      "zoom_in",
      "zoom_out",
      "reset",
    ],
    toolbar_location="below",
  )

  fig.circle(
    x,
    y,
    source=ds,
    alpha=alpha,
    **circle_kwargs
  )
  show(fig)
  return fig

res = bokeh_scatter(
  "PC1",
  "PC2",
  data=pca_df.assign(
    surv=y.reset_index(drop=True)
  ),
  hue="surv",
  size=10,
  legend="surv",
)

plt.show()
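If you want the interactive plot as a standalone HTML file rather than (or in addition to) inline notebook output, Bokeh can save it. A sketch with a hypothetical filename:

from bokeh.io import output_file, save

output_file("pca_scatter.html")  # hypothetical output path
save(res)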
from yellowbrick.features.pca import (
  PCADecomposition,
)

colors = ["rg"[j] for j in y]
pca3_viz = PCADecomposition(
  proj_dim=3, color=colors
)

pca3_viz.fit_transform(X, y)
pca3_viz.finalize()
fig = plt.gcf()
plt.tight_layout()
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection="3d")

ax.scatter(
  xs=X_pca[:, 0],
  ys=X_pca[:, 1],
  zs=X_pca[:, 2],
  c=y,
  cmap="viridis",
)

ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")

17.2 UMAP

import umap

u = umap.UMAP(random_state=42)
X_umap = u.fit_transform(
  StandardScaler().fit_transform(X)
)
X_umap.shape
(1309, 2)
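Unlike t-SNE, a fitted UMAP model keeps the learned mapping around, so new rows can be projected into the same embedding with .transform. A sketch, where X_new is a hypothetical DataFrame with the same columns as X:

scaler = StandardScaler().fit(X)
u = umap.UMAP(random_state=42).fit(
  scaler.transform(X)
)
# Project unseen rows into the existing embedding:
# X_new_umap = u.transform(scaler.transform(X_new))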
fig, ax = plt.subplots(figsize=(10, 10))

pd.DataFrame(X_umap).plot(
  kind="scatter",
  x=0,
  y=1,
  ax=ax,
  c=y,
  alpha=0.2,
  cmap="Spectral",
)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff9af559550>
X_std = StandardScaler().fit_transform(X)

fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.reshape(4)

for i, n in enumerate([2, 5, 10, 50]):
  ax = axes[i]
  u = umap.UMAP(
    random_state=42, n_neighbors=n
  )
  X_umap = u.fit_transform(X_std)

  pd.DataFrame(X_umap).plot(
    kind="scatter",
    x=0,
    y=1,
    ax=ax,
    c=y,
    cmap="Spectral",
    alpha=0.5,
  )
  ax.set_title(f"nn={n}")

plt.tight_layout()
fig, axes = plt.subplots(2, 2, figsize=(10, 10))

axes = axes.reshape(4)
for i, n in enumerate([0, 0.33, 0.66, 0.99]):
  ax = axes[i]
  u = umap.UMAP(random_state=42, min_dist=n)
  X_umap = u.fit_transform(X_std)

  pd.DataFrame(X_umap).plot(
    kind="scatter",
    x=0,
    y=1,
    ax=ax,
    c=y,
    cmap="Spectral",
    alpha=0.5,
  )
  ax.set_title(f"min_dist={n}")

plt.tight_layout()

17.3 t-SNE

from sklearn.manifold import TSNE
X_std = StandardScaler().fit_transform(X)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
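t-SNE is stochastic and, unlike UMAP, has no .transform for out-of-sample rows, so fix random_state if you need a repeatable embedding. A sketch pinning the common knobs:

ts = TSNE(
  random_state=42,
  perplexity=30,      # default; varied in the grid below
  learning_rate=200,
)
X_tsne = ts.fit_transform(X_std)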
fig, ax = plt.subplots(figsize=(10, 10))

colors = ["rg"[j] for j in y]
scat = ax.scatter(
  X_tsne[:, 0],
  X_tsne[:, 1],
  c=colors,
  alpha=0.5,
)

ax.set_xlabel("Embedding 1")
ax.set_ylabel("Embedding 2")
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.reshape(4)

for i, n in enumerate((2, 30, 50, 100)):
  ax = axes[i]
  t = TSNE(random_state=42, perplexity=n)
  X_tsne = t.fit_transform(X_std)  # standardized data, as above

  pd.DataFrame(X_tsne).plot(
    kind="scatter",
    x=0,
    y=1,
    ax=ax,
    c=y,
    cmap="Spectral",
    alpha=0.5,
  )
  ax.set_title(f"perplexity={n}")

plt.tight_layout()
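On wider datasets it is common to reduce to roughly 50 components with PCA first and feed that to t-SNE, which speeds up the optimization considerably. With only eight columns here it is unnecessary, but as a sketch:

X_reduced = PCA(
  n_components=min(50, X_std.shape[1]),
  random_state=42,
).fit_transform(X_std)
X_tsne = TSNE(random_state=42).fit_transform(
  X_reduced
)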