In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
The Titanic dataset is one of the most famous datasets around; just about everyone studying machine learning seems to start with it, and I was no exception. Since the dataset itself is already well covered, this notebook skips the usual rounds of visualization and model-performance tuning and focuses on implementing grid search.
In [0]:
df = pd.read_csv('./train.csv')
print(df.shape)
display(df.head())
In [0]:
df.info()
In [0]:
display(df.isnull().sum())
In [0]:
df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
df.head(3)
Out[0]:
In [0]:
display(df['Cabin'].value_counts())
display(df['Embarked'].value_counts())
In [0]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Cabin'].fillna('n', inplace=True)
df['Embarked'].fillna('S', inplace=True)
display(df.isnull().sum())
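A side note on the Embarked fill: 'S' happens to be the most frequent port in this dataset, but hardcoding it is fragile. A minimal sketch that fills with whatever the observed mode is (same result here):
In [0]:
# fill Embarked with its mode instead of a hardcoded 'S'
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)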
In [0]:
df['Cabin'] = df['Cabin'].str[:1]
df['Cabin'].head()
Out[0]:
In [0]:
df[df['Age'] <= 0]
Out[0]:
In [0]:
def age(x):
    Age = ''
    if x <= 12: Age = 'Child'
    elif x <= 19: Age = 'Teen'
    elif x <= 30: Age = 'Young_adult'
    elif x <= 60: Age = 'Adult'
    else: Age = 'Old'
    return Age
plt.figure(figsize=(13, 7))
names = ['Child', 'Teen', 'Young_adult', 'Adult', 'Old']
df['Age'] = df['Age'].apply(lambda x : age(x))
sns.barplot(data=df, x='Age', y='Survived', order=names, hue='Sex')  # order: fixes the x-axis category order
Out[0]:
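For reference, the same binning can be expressed with pd.cut instead of a hand-rolled function; the bin edges below mirror age() above (right-closed intervals match its <= comparisons). A small self-contained sketch:
In [0]:
# equivalent binning with pd.cut on a demo Series (not re-run on df, whose Age is already converted)
bins = [-np.inf, 12, 19, 30, 60, np.inf]
labels = ['Child', 'Teen', 'Young_adult', 'Adult', 'Old']
ages = pd.Series([5, 15, 25, 45, 70])
print(pd.cut(ages, bins=bins, labels=labels))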
In [0]:
df.head()
Out[0]:
In [0]:
df.Age.isnull().any()
Out[0]:
In [0]:
# encoding
from sklearn.preprocessing import LabelEncoder
def encoding(x):
    for i in ['Sex', 'Age', 'Cabin', 'Embarked']:
        x[i] = LabelEncoder().fit_transform(x[i])
    return x
In [0]:
df = encoding(df)
df.head()
Out[0]:
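Worth noting: scikit-learn documents LabelEncoder as a target encoder; for feature columns, OrdinalEncoder does the same job across several columns in one pass. A small self-contained sketch of the equivalent call:
In [0]:
from sklearn.preprocessing import OrdinalEncoder

demo = pd.DataFrame({'Sex': ['male', 'female', 'male'], 'Embarked': ['S', 'C', 'S']})
# encodes both columns at once instead of looping with LabelEncoder
demo[['Sex', 'Embarked']] = OrdinalEncoder().fit_transform(demo[['Sex', 'Embarked']])
print(demo)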
In [0]:
# onehotencoding (pd.get_dummies)
df = pd.get_dummies(df, columns=['Pclass', 'Sex', 'Age', 'Embarked'])
df.head()
Out[0]:
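A note on the order of operations: pd.get_dummies can one-hot string columns directly, so the integer encoding above is only strictly needed for Cabin, which stays a single ordinal column. A minimal sketch on raw strings:
In [0]:
raw = pd.DataFrame({'Sex': ['male', 'female'], 'Embarked': ['S', 'C']})
# get_dummies handles string columns without a prior LabelEncoder step
print(pd.get_dummies(raw, columns=['Sex', 'Embarked']))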
In [0]:
# split the label off from the features
y = df['Survived']
X = df.drop('Survived', axis=1)
display(X.shape)
display(y.shape)
In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
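One judgment call worth flagging: with a label like Survived, a stratified split keeps the survival rate comparable between train and test. A sketch of the same split with stratification (the runs below use the unstratified split above):
In [0]:
# stratify=y is the only addition to the call above
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=44, stratify=y)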
In [0]:
# cross-validation
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
xgb = XGBClassifier(random_state=44)
gb = GradientBoostingClassifier(random_state=44)
rf = RandomForestClassifier(random_state=44)
xgb_cross = cross_val_score(xgb, X, y, cv=5, verbose=1)
gb_cross = cross_val_score(gb, X, y, cv=5, verbose=1)
rf_cross = cross_val_score(rf, X, y, cv=5, verbose=1)
for count, accuracy in enumerate(xgb_cross):
    print('XGB fold {} accuracy : {:.3f}'.format(count, accuracy))
print('XGB mean accuracy : {:.3f}'.format(np.mean(xgb_cross)))
print('--------------------------------------')
for count, accuracy in enumerate(gb_cross):
    print('GB fold {} accuracy : {:.3f}'.format(count, accuracy))
print('GB mean accuracy : {:.3f}'.format(np.mean(gb_cross)))
print('--------------------------------------')
for count, accuracy in enumerate(rf_cross):
    print('RF fold {} accuracy : {:.3f}'.format(count, accuracy))
print('RF mean accuracy : {:.3f}'.format(np.mean(rf_cross)))
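For classifiers, cross_val_score with an integer cv already uses stratified folds by default, but the splits are not shuffled. A sketch that makes both choices explicit via StratifiedKFold:
In [0]:
from sklearn.model_selection import StratifiedKFold

# shuffled, stratified 5-fold; random_state pins the splits
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=44)
print('RF mean accuracy (shuffled stratified 5-fold) : {:.3f}'.format(
    cross_val_score(rf, X, y, cv=skf).mean()))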
In [0]:
# grid search
from sklearn.model_selection import GridSearchCV
xgb_param = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
gb_param = {
    # 'n_estimators': [100],
    'min_samples_leaf': [7, 9, 13],
    'max_depth': [4, 5, 6, 7],
    'learning_rate': [0.05, 0.02, 0.01],
}
grid_xgb = GridSearchCV(xgb, param_grid=xgb_param, scoring='accuracy', cv=5)
grid_gb = GridSearchCV(gb, param_grid=gb_param, scoring='accuracy', cv=5)
grid_xgb.fit(X_train, y_train)
grid_gb.fit(X_train, y_train)
Out[0]:
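The grid size is worth a quick sanity check: the XGB grid is 3 x 5 x 3 x 3 x 3 = 405 parameter combinations, i.e. 405 x 5 = 2025 model fits with cv=5. Passing n_jobs=-1 spreads those fits across all cores; a sketch with the same grid as above:
In [0]:
# n_jobs=-1 parallelizes the 2025 fits across available cores
grid_xgb = GridSearchCV(xgb, param_grid=xgb_param, scoring='accuracy', cv=5, n_jobs=-1)
grid_xgb.fit(X_train, y_train)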
In [0]:
print('xgboost best param : ', grid_xgb.best_params_)
print('xgboost best accuracy : ', grid_xgb.best_score_)
print('gradient boosting best param : ', grid_gb.best_params_)
print('gradient boosting best accuracy : ', grid_gb.best_score_)
# predict and evaluate the test set with the models trained on the best parameters
xgb_pred = grid_xgb.best_estimator_.predict(X_test)
gb_pred = grid_gb.best_estimator_.predict(X_test)
print('xgboost accuracy(test set) : {:.3f}'.format(accuracy_score(y_test, xgb_pred)))
print('gradient boosting accuracy(test set) : {:.3f}'.format(accuracy_score(y_test, gb_pred)))
# random forest (no grid search; default parameters for comparison)
rf.fit(X_train, y_train)
print('randomforest accuracy(test set) : {:.3f}'.format(accuracy_score(y_test, rf.predict(X_test))))
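Finally, because GridSearchCV defaults to refit=True, the grid object itself already holds best_estimator_ refit on the full training set, so predicting through the grid object is equivalent to the best_estimator_ calls above:
In [0]:
# equivalent to grid_xgb.best_estimator_.predict(X_test) when refit=True (the default)
xgb_pred = grid_xgb.predict(X_test)
print('xgboost accuracy(test set) : {:.3f}'.format(accuracy_score(y_test, xgb_pred)))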