필요한 패키지

데이터의 수집

# pandas is referenced as `pd` below but is not imported anywhere in the
# visible part of this file, so import it here to keep the cell runnable.
import pandas as pd

# The dataset is a legacy .xls workbook; pandas needs the optional `xlrd`
# engine installed to read that format.
url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)

# Keep an independent snapshot of the raw data. A plain `orig_df = df` would
# only create a second name for the same object, so any in-place change to
# `df` would silently change the "original" as well.
orig_df = df.copy()

4.1 누락된 데이터의 분석

# Percentage of missing values per column: the mean of the boolean "is
# missing" mask is the missing fraction, scaled here to 0-100.
df.isna().mean().mul(100)
pclass        0.000000
survived      0.000000
name          0.000000
sex           0.000000
age          20.091673
sibsp         0.000000
parch         0.000000
ticket        0.000000
fare          0.076394
cabin        77.463713
embarked      0.152788
boat         62.872422
body         90.756303
home.dest    43.086325
dtype: float64
# `plt` is used throughout this section but is not imported anywhere in the
# visible part of this file, so import it alongside missingno.
import matplotlib.pyplot as plt
import missingno as msno

# Nullity matrix for 500 randomly chosen rows. The sample is unseeded, so the
# figure differs from run to run.
ax = msno.matrix(orig_df.sample(500))
plt.show()

# Fraction of non-missing values per column, drawn with plain pandas.
# `notna().mean()` is already within [0, 1], so no abs() is needed.
fig, ax = plt.subplots(figsize=(6, 4))
df.notna().mean().plot.bar(ax=ax)
plt.show()

# missingno's bar chart of per-column completeness, again on a random
# 500-row sample.
ax = msno.bar(orig_df.sample(500))
plt.show()

# Heatmap of how strongly missingness in one column correlates with
# missingness in another.
ax = msno.heatmap(df, figsize=(6, 6))
plt.show()

# Dendrogram that clusters columns by similarity of their missingness
# patterns.
ax = msno.dendrogram(df)
plt.show()

4.2 누락된 데이터의 삭제

# Three alternative ways of discarding missing data. Each assignment
# overwrites the previous `df1`, so only the last result is kept.

# 1) Drop every row that has at least one missing value.
df1 = df.dropna(axis="index")

# 2) Drop only the "cabin" column.
df1 = df.drop("cabin", axis="columns")

# 3) Drop every column that has at least one missing value.
df1 = df.dropna(axis="columns")

4.3 데이터의 대치

from sklearn.impute import SimpleImputer

# Labels of all numeric columns; only these are passed to the imputer.
num_cols = df.select_dtypes(include="number").columns

# "mean" is SimpleImputer's default strategy; it is spelled out here so the
# choice is visible. Each missing value is replaced by its column's mean.
im = SimpleImputer(strategy="mean")

# By default the result is a NumPy array, so column labels are not carried
# over from the DataFrame.
imputed = im.fit_transform(df[num_cols])

4.4 지시자 열의 추가

def add_indicator(col):
    """Build a callable that flags missing values in column ``col``.

    The returned function takes a DataFrame and returns an integer Series
    that is 1 where ``col`` is missing and 0 elsewhere. Because it accepts
    the frame as its only argument, it can be passed directly as a keyword
    value to ``DataFrame.assign``.
    """

    def is_missing(frame):
        missing_mask = frame[col].isna()
        return missing_mask.astype(int)

    return is_missing

# Append a 0/1 "cabin_missing" column. `assign` calls the function returned
# by add_indicator with the DataFrame and stores the resulting Series.
df1 = df.assign(
    cabin_missing=add_indicator("cabin"),
)
# Preview the first ten rows, including the new indicator column.
df1.head(n=10)
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest cabin_missing
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO 0
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON 0
2 1 0 Allison, Miss. Helen Loraine female 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON 0
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON 0
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON 0
5 1 1 Anderson, Mr. Harry male 48.0000 0 0 19952 26.5500 E12 S 3 NaN New York, NY 0
6 1 1 Andrews, Miss. Kornelia Theodosia female 63.0000 1 0 13502 77.9583 D7 S 10 NaN Hudson, NY 0
7 1 0 Andrews, Mr. Thomas Jr male 39.0000 0 0 112050 0.0000 A36 S NaN NaN Belfast, NI 0
8 1 1 Appleton, Mrs. Edward Dale (Charlotte Lamson) female 53.0000 2 0 11769 51.4792 C101 S D NaN Bayside, Queens, NY 0
9 1 0 Artagaveytia, Mr. Ramon male 71.0000 0 0 PC 17609 49.5042 NaN C NaN 22.0 Montevideo, Uruguay 1