18.1 K-Means

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans

# X holds the Titanic features from earlier chapters; standardize them
# so no single column dominates the Euclidean distances K-means uses.
X_std = preprocessing.StandardScaler().fit_transform(X)
km = KMeans(2, random_state=42)
km.fit(X_std)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)
X_km = km.predict(X_std)  # predict on the same standardized features the model was fit on
X_km
array([1, 1, 1, ..., 1, 1, 1], dtype=int32)
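The fitted model stores the within-cluster sum of squared distances in its .inertia_ attribute; lower values mean tighter clusters, and inertia drops monotonically as k grows, which is what the elbow plot below exploits. A quick sketch printing it along with the cluster sizes:

print(km.inertia_)
print(pd.Series(X_km).value_counts())  # observations per cluster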
inertias = []
sizes = range(2, 12)

for k in sizes:
  k2 = KMeans(random_state=42, n_clusters=k)
  k2.fit(X_std)  # fit on the standardized features, as above
  inertias.append(k2.inertia_)

fig, ax = plt.subplots(figsize=(8, 8))
pd.Series(inertias, index=sizes).plot(ax=ax)
ax.set_xlabel("K")
ax.set_ylabel("Inertia")
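Reading the elbow off a plot is subjective. If the third-party kneed package is available (an assumption; the library is not used above), its KneeLocator can pick the knee programmatically:

# Assumes `pip install kneed`; curve/direction match decreasing inertia.
from kneed import KneeLocator

kl = KneeLocator(
  list(sizes),
  inertias,
  curve="convex",
  direction="decreasing",
)
print(kl.elbow)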
from sklearn import metrics

inertias = []
sils = []
chs = []
dbs = []
sizes = range(2, 12)

for k in sizes:
  k2 = KMeans(random_state=42, n_clusters=k)
  k2.fit(X_std)
  inertias.append(k2.inertia_)
  sils.append(
    metrics.silhouette_score(X_std, k2.labels_)
  )
  chs.append(
    metrics.calinski_harabasz_score(
      X_std, k2.labels_
    )
  )
  dbs.append(
    metrics.davies_bouldin_score(
      X_std, k2.labels_
    )
  )

fig, ax = plt.subplots(figsize=(10, 10))
(
  pd.DataFrame(
    {
      "inertia": inertias,
      "silhouette": sils,
      "calinski": chs,
      "davis": dbs,
      "k": sizes,
    }
  )
  .set_index("k")
  .plot(ax=ax, subplots=True, layout=(2, 2))
)
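To read the four panels numerically: silhouette and Calinski-Harabasz are better when larger, while Davies-Bouldin is better when smaller. A minimal sketch pulling the winning k from each list:

import numpy as np

print("best k by silhouette:", sizes[int(np.argmax(sils))])
print("best k by calinski:  ", sizes[int(np.argmax(chs))])
print("best k by davies:    ", sizes[int(np.argmin(dbs))])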
from yellowbrick.cluster.silhouette import (
  SilhouetteVisualizer,
)

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.reshape(4)

for i, k in enumerate(range(2, 6)):
  ax = axes[i]
  sil = SilhouetteVisualizer(
    KMeans(n_clusters=k, random_state=42),
    ax=ax,
  )
  sil.fit(X_std)
  sil.finalize()
  ax.set_xlim(-0.2, 0.8)

plt.tight_layout()
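Yellowbrick can also automate the elbow search itself; a sketch with its KElbowVisualizer, which repeats the manual loop above (scoring distortion by default) and marks the detected elbow:

from yellowbrick.cluster import KElbowVisualizer

fig, ax = plt.subplots(figsize=(8, 6))
elbow = KElbowVisualizer(
  KMeans(random_state=42), k=(2, 12), ax=ax
)
elbow.fit(X_std)
elbow.finalize()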

18.2 Agglomerative Clustering

from scipy.cluster import hierarchy

fig, ax = plt.subplots(figsize=(20, 10))
dend = hierarchy.dendrogram(
  hierarchy.linkage(X_std, method="ward")
)
from scipy.cluster import hierarchy

fig, ax = plt.subplots(figsize=(20, 10))
dend = hierarchy.dendrogram(
  hierarchy.linkage(X_std, method="ward"),
  truncate_mode="lastp",
  p=20,
  show_contracted=True,
)
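scipy can also cut the tree into flat cluster labels without going through scikit-learn; a sketch using fcluster on the same Ward linkage:

# Cut the Ward tree into at most four flat clusters.
Z = hierarchy.linkage(X_std, method="ward")
flat_labels = hierarchy.fcluster(
  Z, t=4, criterion="maxclust"
)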
from sklearn.cluster import (
  AgglomerativeClustering,
)

ag = AgglomerativeClustering(
  n_clusters=4,
  affinity="euclidean",
  linkage="ward",
)
ag.fit(X_std)
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=4,
                        pooling_func='deprecated')
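The fitted estimator exposes the flat assignments as .labels_. A quick check of the four cluster sizes, and of how they nest inside the two K-means clusters fit earlier:

print(pd.Series(ag.labels_).value_counts())
print(pd.crosstab(km.labels_, ag.labels_))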

18.3 Understanding Clusters

km = KMeans(n_clusters=2, random_state=42)
km.fit(X_std)
labels = km.predict(X_std)
(
  X.assign(cluster=labels, survived=y)
  .groupby("cluster")
  .agg(["mean", "var"])
  .T
)
cluster               0         1
pclass     mean  0.527478 -1.420910
           var   0.265532  0.138897
age        mean -0.281925  0.922530
           var   0.651165  1.145415
sibsp      mean -0.009948 -0.108926
           var   1.164827  0.303463
parch      mean  0.387949  0.377410
           var   0.830288  0.539488
fare       mean -0.349293  0.882876
           var   0.056372  2.223786
sex_male   mean  0.678647  0.553719
           var   0.218316  0.247797
embarked_Q mean  0.123679  0.016529
           var   0.108497  0.016301
embarked_S mean  0.741015  0.586777
           var   0.192115  0.243140
survived   mean  0.300211  0.595041
           var   0.210307  0.241633
fig, ax = plt.subplots(figsize=(10, 6))
(
  X.assign(cluster=labels, survived=y)
  .groupby("cluster")
  .mean()
  .T.plot.bar(ax=ax)
)
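The centroids carry the same information as the group means above; pulling km.cluster_centers_ into a DataFrame attaches the feature names:

centers = pd.DataFrame(
  km.cluster_centers_, columns=X.columns
)
print(centers.T)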
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    StandardScaler,
)
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 8))

pca = PCA(random_state=42)
X_pca = pca.fit_transform(
    StandardScaler().fit_transform(X)
)
sns.scatterplot(
  x="PC1",
  y="PC2",
  data=X.assign(
    PC1=X_pca[:, 0],
    PC2=X_pca[:, 1],
    cluster=labels,
  ),
  hue="cluster",
  alpha=0.5,
  ax=ax,
)
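Two components only capture part of the variance; checking the ratio tells you how literally to read the scatter plot:

print(pca.explained_variance_ratio_[:2])
print(pca.explained_variance_ratio_[:2].sum())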
(
  X.assign(cluster=labels)
  .groupby("cluster")
  .age.describe()
  .T
)
cluster           0           1
count    946.000000  363.000000
mean      -0.281925    0.922530
std        0.806948    1.070241
min       -2.221251   -2.162722
25%       -0.628414    0.184938
50%       -0.175241    0.809247
75%        0.106899    1.667672
max        3.540599    4.008830
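The same per-cluster comparison works visually for any single column; a minimal sketch using the seaborn import from above:

fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=labels, y=X.age, ax=ax)
ax.set_xlabel("cluster")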
from sklearn import tree

# Surrogate model: a decision tree trained to predict the cluster
# labels reveals which features drive the assignments.
dt = tree.DecisionTreeClassifier()
dt.fit(X, labels)

for col, val in sorted(zip(X.columns, dt.feature_importances_),
                       key=lambda col_val: col_val[1],
                       reverse=True):
  print(f"{col:10}{val:10.3f}")
pclass         0.902
age            0.077
sex_male       0.013
embarked_S     0.003
fare           0.003
parch          0.003
sibsp          0.000
embarked_Q     0.000
from IPython.display import Image

from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()

export_graphviz(
  dt,
  out_file=dot_data,
  feature_names=X.columns,
  class_names=["0", "1"],
  max_depth=2,
  filled=True,
)

g = pydotplus.graph_from_dot_data(
  dot_data.getvalue()
)

Image(g.create_png())
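If Graphviz and pydotplus are not available, scikit-learn 0.21+ can render the same surrogate tree with matplotlib alone:

fig, ax = plt.subplots(figsize=(12, 6))
tree.plot_tree(
  dt,
  feature_names=list(X.columns),
  class_names=["0", "1"],
  max_depth=2,
  filled=True,
  ax=ax,
)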