4장 누락된 데이터
누락된 데이터를 분석하는 방법을 다룹니다.
# Load the Titanic passenger spreadsheet straight from its public URL.
url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
# Keep an independent snapshot of the raw data. A plain `orig_df = df` would
# only create a second name for the same object, so in-place edits to df
# would silently change the "original" as well.
orig_df = df.copy()
# Percentage of missing values in each column.
df.isnull().mean() * 100
import missingno as msno
# missingno matrix view of a random 500-row sample of the raw data.
ax = msno.matrix(orig_df.sample(n=500))
plt.show()

# Fraction of non-missing values per column, drawn as a pandas bar chart.
fig, ax = plt.subplots(figsize=(6, 4))
(1 - df.isna().mean()).abs().plot(kind="bar", ax=ax)
plt.show()

# missingno bar view of another random 500-row sample.
ax = msno.bar(orig_df.sample(n=500))
plt.show()

# missingno heatmap computed on the full frame.
ax = msno.heatmap(df, figsize=(6, 6))
plt.show()

# missingno dendrogram computed on the full frame.
ax = msno.dendrogram(df)
plt.show()
# Three alternative ways to discard missing data. Each one starts again from
# df, so df1 ends up holding only the result of the last statement.
df1 = df.dropna(axis="index")  # drop every row containing a missing value
df1 = df.drop(columns=["cabin"])  # drop only the cabin column
df1 = df.dropna(axis="columns")  # drop every column containing a missing value
from sklearn.impute import SimpleImputer
# Restrict imputation to the numeric columns of df.
num_cols = df.select_dtypes(include=["number"]).columns
# Fill each missing value with its column mean ("mean" is the default strategy).
im = SimpleImputer(strategy="mean")
imputed = im.fit_transform(df.loc[:, num_cols])
def add_indicator(col):
    """Build a function that flags the rows where column ``col`` is missing.

    The returned function takes a DataFrame and returns an integer Series
    (1 where ``col`` is missing, 0 otherwise), so it can be passed directly
    as a keyword value to ``DataFrame.assign``.

    Args:
        col: Label of the column to check for missing values.

    Returns:
        A one-argument callable mapping a DataFrame to the 0/1 indicator
        Series for ``col``.
    """

    def wrapper(df):
        # isna() yields booleans; astype(int) turns them into 1 (missing) / 0.
        return df[col].isna().astype(int)

    return wrapper
# assign() accepts a callable: it is invoked with the DataFrame and its return
# value becomes the new cabin_missing column.
cabin_flag = add_indicator("cabin")
df1 = df.assign(cabin_missing=cabin_flag)
df1.head(n=10)