18장 클러스터링
클러스터링 기법을 알아봅니다.
from sklearn.cluster import KMeans

# Standardize the features so no single column dominates the Euclidean
# distances that k-means minimizes.
X_std = preprocessing.StandardScaler().fit_transform(X)
km = KMeans(2, random_state=42)
km.fit(X_std)
# BUG FIX: predict on the same standardized matrix the model was fit on.
# The original called km.predict(X), assigning clusters in a different
# (unscaled) feature space than the one used for training.
X_km = km.predict(X_std)
X_km
# Elbow method: fit k-means for k = 2..11 and plot the inertia
# (within-cluster sum of squares) against k; the "elbow" suggests k.
inertias = []
sizes = range(2, 12)
for k in sizes:
    k2 = KMeans(random_state=42, n_clusters=k)
    # CONSISTENCY FIX: fit on the standardized features, as every other
    # k-means fit in this chapter does (the original fit on raw X here,
    # making this curve incomparable with the later metric sweep).
    k2.fit(X_std)
    inertias.append(k2.inertia_)
fig, ax = plt.subplots(figsize=(8, 8))
pd.Series(inertias, index=sizes).plot(ax=ax)
ax.set_xlabel("K")
ax.set_ylabel("Inertia")
from sklearn import metrics

# Sweep k = 2..11 and record inertia plus three label-quality scores
# (silhouette, Calinski-Harabasz, Davies-Bouldin) for each candidate k.
inertias = []
sils = []
chs = []
dbs = []
sizes = range(2, 12)
for k in sizes:
    k2 = KMeans(random_state=42, n_clusters=k)
    k2.fit(X_std)
    inertias.append(k2.inertia_)
    # BUG FIX: score against X_std, not raw X. The labels were produced
    # from the standardized features; evaluating them on the unscaled
    # matrix mixes feature spaces and distorts every distance-based score.
    sils.append(
        metrics.silhouette_score(X_std, k2.labels_)
    )
    chs.append(
        metrics.calinski_harabasz_score(
            X_std, k2.labels_
        )
    )
    dbs.append(
        metrics.davies_bouldin_score(
            X_std, k2.labels_
        )
    )
# Plot the four diagnostics in a 2x2 grid indexed by k.
fig, ax = plt.subplots(figsize=(10, 10))
(
    pd.DataFrame(
        {
            "inertia": inertias,
            "silhouette": sils,
            "calinski": chs,
            # Typo fix: the metric is Davies-Bouldin ("davies", not "davis").
            "davies": dbs,
            "k": sizes,
        }
    )
    .set_index("k")
    .plot(ax=ax, subplots=True, layout=(2, 2))
)
from yellowbrick.cluster.silhouette import (
    SilhouetteVisualizer,
)

# One silhouette plot per candidate cluster count k = 2..5,
# arranged in a 2x2 grid with a common x-axis range for comparison.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for k, panel in zip(range(2, 6), axes.reshape(4)):
    viz = SilhouetteVisualizer(
        KMeans(n_clusters=k, random_state=42),
        ax=panel,
    )
    viz.fit(X_std)
    viz.finalize()
    panel.set_xlim(-0.2, 0.8)
plt.tight_layout()
from scipy.cluster import hierarchy

# Full Ward-linkage dendrogram over the standardized features.
fig, ax = plt.subplots(figsize=(20, 10))
linkage_matrix = hierarchy.linkage(X_std, method="ward")
dend = hierarchy.dendrogram(linkage_matrix)
from scipy.cluster import hierarchy

# Truncated dendrogram: show only the last 20 merges (p=20) and mark
# collapsed subtrees, keeping the plot readable for large datasets.
fig, ax = plt.subplots(figsize=(20, 10))
linkage_matrix = hierarchy.linkage(X_std, method="ward")
dend = hierarchy.dendrogram(
    linkage_matrix,
    truncate_mode="lastp",
    p=20,
    show_contracted=True,
)
from sklearn.cluster import (
    AgglomerativeClustering,
)

# Cut the hierarchy into four flat clusters with Ward linkage.
# API FIX: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4; Ward linkage requires Euclidean distances, which is the
# default, so the argument is simply dropped.
ag = AgglomerativeClustering(
    n_clusters=4,
    linkage="ward",
)
# CONSISTENCY FIX: fit on the standardized features, matching the Ward
# dendrograms computed from X_std (the original fit on raw X).
ag.fit(X_std)
# Final model: two clusters on the standardized features.
# FIX: pin random_state=42 for reproducibility, matching every other
# KMeans fit in this chapter (the original omitted it here).
km = KMeans(n_clusters=2, random_state=42)
km.fit(X_std)
labels = km.predict(X_std)

# Per-cluster mean and variance of each feature (plus the survival
# target) to characterize what distinguishes the two clusters.
(
    X.assign(cluster=labels, survived=y)
    .groupby("cluster")
    .agg(["mean", "var"])
    .T
)

# Bar chart of per-cluster feature means for a visual comparison.
fig, ax = plt.subplots(figsize=(10, 6))
(
    X.assign(cluster=labels, survived=y)
    .groupby("cluster")
    .mean()
    .T.plot.bar(ax=ax)
)
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    StandardScaler,
)
import seaborn as sns

# Project the standardized features onto the first two principal
# components and color each point by its cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
pca = PCA(random_state=42)
X_pca = pca.fit_transform(
    StandardScaler().fit_transform(X)
)
# API FIX: seaborn deprecated positional x/y column names in 0.12 and
# later removed them; they must be passed as keyword arguments.
sns.scatterplot(
    x="PC1",
    y="PC2",
    data=X.assign(
        PC1=X_pca[:, 0],
        PC2=X_pca[:, 1],
        cluster=labels,
    ),
    hue="cluster",
    alpha=0.5,
    ax=ax,
)
# Summary statistics of the age column within each cluster.
with_clusters = X.assign(cluster=labels)
age_summary = with_clusters.groupby("cluster").age.describe()
age_summary.T
# Surrogate decision tree: predict the cluster labels from the raw
# features so its importances reveal which features drive membership.
dt = tree.DecisionTreeClassifier()
dt.fit(X, labels)
ranked = sorted(
    zip(X.columns, dt.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature, importance in ranked:
    print(f"{feature:10}{importance:10.3f}")
from IPython.display import Image
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus

# Render the top two levels of the surrogate tree as an inline PNG.
dot_buffer = StringIO()
export_graphviz(
    dt,
    out_file=dot_buffer,
    feature_names=X.columns,
    class_names=["0", "1"],
    max_depth=2,
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
Image(graph.create_png())