Machine Learning with Model Stacking in Python

  A Demo on the Titanic Dataset

Posted by Haby on June 23, 2017
# -*- coding: utf-8 -*-
"""
        Created on Tue Feb 10 17:37:31 2018

        Titanic Competition in Kaggle
        Dataset path : E:\Data_and_Script\Python_Script\titanic\train.csv
        Data from : Kaggle.com

        Python 3.6.3
        GUI : Spyder from Anaconda 5.0.1
        OS : windows 10 v1709 64 bit

        P1 : EDA
        P2 : Feature Engineering
        P3 : Model and Ensemble

        @author: Haby
"""
# 1.1 import Package
# general packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

# ML packages
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

# Model Tune packages
from sklearn.model_selection import GridSearchCV

# CV folds
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# Stacking Model
import xgboost as xgb
# 1.2 import data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
c = [train,test]
PassengerId = test['PassengerId']

# 1.3 check NAs
print(train.isnull().sum())
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
# 2.1 Basic visualization of Survived and NAs
_,ax = plt.subplots(1,2)
sns.countplot(train.Survived,ax = ax[0])
(train.isnull().sum()/len(train)).plot(ax =ax[1],kind = 'bar',
    title = 'NAs in dataset')
# More people died than survived; in the training set more than 75% of Cabin is NA,
# about 20% of Age is NA, and there are a few NAs in Embarked

# 2.2 Find the important features

# 2.2.1 All numeric Values
sns.heatmap(train.corr(),annot=True)
# Pclass and Fare seem important
# 2.2.2 Sex
_,ax = plt.subplots(1,2)
sns.countplot(train.Sex,ax =ax[0])
sns.barplot(x = train.Sex,y = train.Survived,ax =ax[1])
# Survival rate clearly differs between the sexes
# str to numeric
for data in c :
    data['Sex'] = data['Sex'].replace(['male','female'],[0,1])

# 2.2.3 Parch and SibSp
_,ax = plt.subplots(2,2)
sns.countplot(train.Parch,ax =ax[0,0])
sns.barplot(x = train.Parch,y = train.Survived,ax =ax[0,1])
sns.countplot(train.SibSp,ax =ax[1,0])
sns.barplot(x = train.SibSp,y = train.Survived,ax =ax[1,1])
# the two variables are strongly related; consider combining them into one variable

# FamilySize and IsAlone
for data in c :
    data['FSize'] = data['Parch'] + data['SibSp'] + 1

# plot
_,ax = plt.subplots(1,2)
sns.countplot(train.FSize,ax =ax[0])
sns.barplot(x = train.FSize,y = train.Survived,ax =ax[1])
# survival rate increases with FSize up to FSize = 4

# group family size: single, couple (or parent/child), medium family and large family
for data in c :
    data['FSize'] = data['FSize'].replace([1,2,3,4,5,6,7,8,11],[1,2,2,3,4,4,4,4,4])
# plot
_,ax = plt.subplots(1,2)
sns.countplot(train.FSize,ax =ax[0])
sns.barplot(x = train.FSize,y = train.Survived,ax =ax[1])

# 2.2.4 Embarked
_,ax = plt.subplots(1,2)
sns.countplot(train.Embarked,ax =ax[0])
sns.barplot(x = train.Embarked,y = train.Survived,ax =ax[1])
# Survival rate clearly differs by port of embarkation
# fill NAs with the most frequent value, 'S'
for data in c :
    data['Embarked'] = data.Embarked.fillna('S')
# str to numeric
for data in c :
    data['Embarked'] = data['Embarked'].replace(['S','Q','C'],[0,1,2])

# 2.2.5 Title from Name
for data in c:
    data['Title'] = data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# group title
for data in c :
    data['Title'] = data['Title'].replace(['Mme','Ms','Mlle','Lady','Dr','Rev','Col',
        'Major','Don','Mme','Countess','Sir','Jonkheer','Capt','Dona'],
        ['Mrs','Miss','Miss','Mrs','Other','Other','Other','Other','Other','Other',
         'Mrs','Other','Other','Other','Mrs'])
# check average age and survival rate per Title
_,ax = plt.subplots(1,2)
train.groupby('Title')['Age'].mean().plot(kind = 'bar',ax = ax[0],title = 'Avg Age')
train.groupby('Title')['Survived'].mean().plot(kind = 'bar',ax = ax[1],title = 'Survived')
# str to numeric
for data in c :
    data['Title'] = data['Title'].replace(['Mr','Mrs','Miss','Master','Other'],
        [0,1,2,3,4])
# Title may have a strong relationship with Age and Sex; it may be dropped in the ensemble
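A quick way to eyeball that suspected overlap is a per-Title summary (a minimal sketch, not part of the original pipeline; it reuses the numeric Sex codes created above, so the Sex column below is the share of females in each Title group):

# mean Sex (share of females), Age and Survived per Title group (sketch)
print(train.groupby('Title')[['Sex','Age','Survived']].mean())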

# 2.2.6 Age
# Age has a skewed distribution with a long tail; I first considered scaling it,
# but binning Age into groups turned out to be the better choice
g = sns.FacetGrid(col = 'Survived',data = train)
g.map(sns.distplot,'Age')
# impute missing Age with the mean age of each Title group
for data in c :
    data['Age'] = data.groupby('Title')['Age'].transform(lambda x: x.fillna(x.mean()))
# plot again
g = sns.FacetGrid(col = 'Survived',data = train)
g.map(sns.distplot,'Age')
# Age ranges from 0 to 80; bin it into 5 quantile groups with qcut
for data in c :
    data['Age_Group'] = pd.qcut(data['Age'],5,labels = [0,1,2,3,4]).astype(int)
# check counts and survival rate
_,ax = plt.subplots(1,2)
sns.countplot(train.Age_Group,ax = ax[0])
sns.barplot(x = 'Age_Group', y = 'Survived',data = train,ax = ax[1])

# 2.2.7 Cabin
# take the first letter of each Cabin value and fill NAs with 'X'
for data in c :
    data['Cabin'] = data.Cabin.fillna('X')
    data['Cabin'] = data['Cabin'].map(lambda x : x[0])
# check counts and survival rate
_,ax = plt.subplots(1,2)
sns.countplot(train.Cabin,ax = ax[0])
sns.barplot(x = 'Cabin', y = 'Survived',data = train,ax = ax[1])
# Cabin = X has the largest count but the lowest survival rate
# cabins A, B, C, D, E, F and T are in Pclass 1, so Cabin is strongly related to Pclass
# Cabin reflects location on the Titanic; a better location may only have improved the odds
# during the initial impact, since once passengers were in the water everyone's chances were the same
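A simple crosstab backs up the Cabin/Pclass observation (a minimal sketch, using the single-letter Cabin values created above):

# counts of each cabin letter per passenger class (sketch)
print(pd.crosstab(train['Cabin'], train['Pclass']))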

# 2.2.8 Log-transform the Fare
for data in c :
    data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x != 0 else 0)
# fill na
test['Fare'] = test.groupby(['Pclass','Cabin'])['Fare'].transform(lambda x: x.fillna(x.mean()))

# plot
sns.distplot(train.Fare)

# 2.3 drop unneeded columns
drop_col = ['PassengerId','Name','Ticket','Age','SibSp','Parch']
for data in c :
    data.drop(drop_col,inplace = True,axis = 1)

# 2.4 corr heat map
sns.heatmap(train.corr(),annot=True)

# 2.5 Dummy all variables
train = pd.get_dummies(train,columns = ['Pclass','Cabin','Embarked','FSize',
                                         'Title','Age_Group'])
test = pd.get_dummies(test,columns = ['Pclass','Cabin','Embarked','FSize',
                                         'Title','Age_Group'])
# drop Cabin_T so train and test end up with the same dummy columns (no T cabins in test)
train.drop('Cabin_T',inplace = True,axis = 1)

print(train.shape)
(891, 31)
# 3. Modeling
# Level one model I want to use

        # ---------------------------------------- #
        #   SVM
        #   KNN
        #   Logistic Regression
        #   Random Forest
        #   Naive Bayes
        #   Linear SVC
        #   Decision Tree
        #   AdaBoost classifier
        #   Gradient Boosting classifier
        # ---------------------------------------- #


# 3.1 set up target / data datasets
train_Survived = train['Survived'] # target
train_data = train.drop('Survived',axis = 1) # data

# 3.2 train test split for scoring
x_train,x_test,y_train,y_test = train_test_split(train_data,train_Survived,
                                               test_size = 0.25,random_state = 13)
# 3.3 set up dataframe to see accuracy
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Linear SVC',
              'Decision Tree', 'AdaBoost classifier',
              'Gradient Boosting Classifier'],
    'Score': [accuracy_score(SVC().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(KNeighborsClassifier().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(LogisticRegression().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(RandomForestClassifier().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(GaussianNB().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(LinearSVC().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(DecisionTreeClassifier().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(AdaBoostClassifier().fit(x_train,y_train).predict(x_test),y_test),
              accuracy_score(GradientBoostingClassifier().fit(x_train,y_train).predict(x_test),y_test),
              ]})
models.sort_values(by='Score', ascending=False)
print(models)
g = models.plot(kind = 'bar',title = 'Models Accuracy Score')
g.set_xticklabels(models['Model'], rotation = 45, ha = 'right')

                          Model     Score
0       Support Vector Machines  0.834081
1                           KNN  0.807175
2           Logistic Regression  0.852018
3                 Random Forest  0.807175
4                   Naive Bayes  0.811659
5                    Linear SVC  0.838565
6                 Decision Tree  0.798206
7           AdaBoost classifier  0.852018
8  Gradient Boosting Classifier  0.816143

# 3.4 Get more reliable accuracy estimates with CV folds
cv_split = ShuffleSplit(n_splits = 10, test_size = .3,
                                        train_size = .6, random_state = 13)

# models to be tested
mod = [SVC(),KNeighborsClassifier(),LogisticRegression(),RandomForestClassifier(),
       GaussianNB(),LinearSVC(),DecisionTreeClassifier(),AdaBoostClassifier(),
       GradientBoostingClassifier()]

# Cross validation
cv_train_score = list()
cv_test_score = list()
for model in mod :
    cv_result = cross_validate(model, train_data, train_Survived, cv = cv_split, return_train_score = True)
    cv_train_score.append(cv_result['train_score'].mean())
    cv_test_score.append(cv_result['test_score'].mean())

cv_model = pd.DataFrame({
        'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
                  'Random Forest', 'Naive Bayes', 'Linear SVC',
                  'Decision Tree', 'AdaBoost classifier',
                  'Gradient Boosting Classifier'],
        'CVTrainScore' :cv_train_score,
        'CVTestScore' : cv_test_score})
print(cv_model)
   CVTestScore  CVTrainScore                         Model
0     0.825000      0.834831       Support Vector Machines
1     0.800373      0.867603                           KNN
2     0.825000      0.850000           Logistic Regression
3     0.797388      0.951685                 Random Forest
4     0.771642      0.794007                   Naive Bayes
5     0.821642      0.845880                    Linear SVC
6     0.778731      0.963296                 Decision Tree
7     0.815672      0.855243           AdaBoost classifier
8     0.819776      0.916479  Gradient Boosting Classifier
# 3.5 Hyperparameter tuning
# I tried to write one function to tune everything, but the grid searches are computationally
# heavy, so I tune each model individually
        # use estimator.get_params() to list the tunable parameters
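For reference, the generic tuning loop mentioned above could look like the sketch below; the helper name tune_models and the tiny grids in the commented-out usage are illustrative assumptions, not the grids actually used in the runs that follow.

# minimal sketch of a loop-based tuner, reusing GridSearchCV and the estimators imported above
def tune_models(model_grids, X, y, cv):
    results = {}
    for name, (estimator, grid) in model_grids.items():
        gs = GridSearchCV(estimator, param_grid = grid, scoring = 'roc_auc', cv = cv)
        gs.fit(X, y)
        results[name] = (gs.best_score_, gs.best_params_)
    return results

# illustrative usage only:
# results = tune_models({'KNN': (KNeighborsClassifier(), {'n_neighbors': [5, 9]}),
#                        'LogReg': (LogisticRegression(), {'C': [0.1, 1, 10]})},
#                       train_data, train_Survived, cv_split)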

# 3.5.1 SVC / Linear SVC
SVC().get_params()
param_grid = {'C':[1,10,100],
              'gamma': [0.01,0.001],
              'degree': [3,5,7],
              'kernel' :['rbf','linear'],
              'random_state' : [13]}
tuned_svc = GridSearchCV(SVC(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_svc.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_svc.best_params_)
Best Hyper Parameters:
 {'C': 100, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf', 'random_state': 13}
# 3.5.2 KNN
KNeighborsClassifier().get_params()
param_grid = {'n_neighbors':[3,5,7,9,11],
              'leaf_size':[1,2,3,5],
              'weights':['uniform', 'distance'],
              'algorithm':['auto'],
              'n_jobs':[-1]}
tuned_knn = GridSearchCV(KNeighborsClassifier(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_knn.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_knn.best_params_)
Best Hyper Parameters:
 {'algorithm': 'auto', 'leaf_size': 5, 'n_jobs': -1, 'n_neighbors': 9, 'weights': 'uniform'}
# 3.5.3 Logistic Regression
LogisticRegression().get_params()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'max_iter': [25,50,100,200],
              'random_state': [13],
              'n_jobs': [-1]}
tuned_lr = GridSearchCV(LogisticRegression(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_lr.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_lr.best_params_)
Best Hyper Parameters:
 {'C': 1, 'max_iter': 25, 'n_jobs': -1, 'random_state': 13}
# 3.5.4 Random Forest
RandomForestClassifier().get_params()
param_grid = {'n_estimators': [10,100,300,500,800,1000],
              'max_features': ['auto'],
              'criterion' : ['gini','entropy'],
              'n_jobs' : [-1],
              'random_state' : [13]}
tuned_rf = GridSearchCV(RandomForestClassifier(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_rf.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_rf.best_params_)

Best Hyper Parameters:
 {'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 500, 'n_jobs': -1, 'random_state': 13}
# 3.5.5 Decision Tree
DecisionTreeClassifier().get_params()
param_grid = {'criterion': ['gini', 'entropy'],
              'max_depth': [2,4,6,8,10],
              'min_samples_split': [2,3,4,5,8],
              'min_samples_leaf': [1,2,3,5,8],
              'max_features': ['auto',None],
              'random_state': [13]}
tuned_tree = GridSearchCV(DecisionTreeClassifier(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_tree.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_tree.best_params_)
Best Hyper Parameters:
 {'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 8, 'random_state': 13}
# 3.5.6 AdaBoost classifier
AdaBoostClassifier().get_params()
param_grid = {'n_estimators': [500,800,1000,1200,1500],
              'learning_rate':[1,0.9],
              'algorithm' : ['SAMME', 'SAMME.R'],
              'random_state' : [13]}
tuned_ada = GridSearchCV(AdaBoostClassifier(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_ada.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_ada.best_params_)
Best Hyper Parameters:
 {'algorithm': 'SAMME', 'learning_rate': 1, 'n_estimators': 1500, 'random_state': 13}
# 3.5.7 Gradient Boosting Classifier
GradientBoostingClassifier().get_params()
param_grid = {'learning_rate': [0.1],
              'min_samples_split':[2,5,8],
              'min_samples_leaf':[1],
              'max_depth':[3,7],
              'subsample':[.8],
              'n_estimators':[300,500],
              'random_state':[13]
              }
tuned_gbm = GridSearchCV(GradientBoostingClassifier(),param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tuned_gbm.fit(train_data,train_Survived)
print("Best Hyper Parameters:\n",tuned_gbm.best_params_)
Best Hyper Parameters:
 {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 300, 'random_state': 13, 'subsample': 0.8}
# 3.5.8 cv_result with Hyperparameter Tunes
cv_result = pd.DataFrame({
        'Model' : ['Support Vector Machines/Linear SVC', 'KNN', 'Logistic Regression',
                  'Random Forest', 'Decision Tree', 'AdaBoost classifier',
                  'Gradient Boosting Classifier'],
        'Score' : [tuned_svc.best_score_,tuned_knn.best_score_,tuned_lr.best_score_,
                   tuned_rf.best_score_,tuned_tree.best_score_,tuned_ada.best_score_,
                   tuned_gbm.best_score_]})
print(cv_result)
g = cv_result.plot(kind = 'bar',title = 'Models Accuracy Score with Hyperparameter Tunes')
g.set_xticklabels(cv_result['Model'], rotation = 45, ha = 'right')
                                Model     Score
0  Support Vector Machines/Linear SVC  0.859236
1                                 KNN  0.852367
2                 Logistic Regression  0.863971
3                       Random Forest  0.852768
4                       Decision Tree  0.848822
5                 AdaBoost classifier  0.863679
6        Gradient Boosting Classifier  0.868139

# 3.5.9 CV train / test scores with tuned parameters
# models to be tested
svm_mod = SVC(C = 100,degree = 3, gamma = .01, kernel = 'rbf',random_state = 13)
knn_mod = KNeighborsClassifier(leaf_size = 5,n_neighbors = 9,weights = 'uniform')
lr_mod = LogisticRegression(C= 1,max_iter = 25,random_state = 13)
rf_mod = RandomForestClassifier(criterion = 'entropy',n_estimators = 500,random_state = 13)
tree_mod = DecisionTreeClassifier(criterion = 'gini',max_depth = 4,min_samples_leaf = 3,
                                    min_samples_split = 8,random_state = 13)
ada_mod = AdaBoostClassifier(algorithm = 'SAMME',learning_rate = 1,n_estimators = 1500,
                                random_state = 13)
gbm_mod = GradientBoostingClassifier(learning_rate = .1,max_depth = 3,min_samples_leaf = 1,
                                     min_samples_split = 8,n_estimators = 300,
                                     random_state = 13,subsample = .8)

mod_tuned = [svm_mod,knn_mod,lr_mod,rf_mod,tree_mod,ada_mod,gbm_mod]

# Cross validation
cv_train_score_tuned = list()
cv_test_score_tuned = list()
for model in mod_tuned :
    cv_result = cross_validate(model, train_data, train_Survived, cv = cv_split, return_train_score = True)
    cv_train_score_tuned.append(cv_result['train_score'].mean())
    cv_test_score_tuned.append(cv_result['test_score'].mean())

tuned_model = pd.DataFrame({
        'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
                  'Random Forest', 'Decision Tree', 'AdaBoost classifier',
                  'Gradient Boosting Classifier'],
        'CVTrainScore' :cv_train_score_tuned,
        'CVTestScore' : cv_test_score_tuned})
print(tuned_model)
   CVTestScore  CVTrainScore                         Model
0     0.818657      0.854120       Support Vector Machines
1     0.811940      0.855243                           KNN
2     0.825000      0.850000           Logistic Regression
3     0.803358      0.963296                 Random Forest
4     0.817910      0.846816                 Decision Tree
5     0.819776      0.871348           AdaBoost classifier
6     0.822015      0.950375  Gradient Boosting Classifier
# 3.6 select the top 5 models for stacking
print('Selected models: SVM, KNN, Logistic Regression, Gradient Boosting Classifier, Decision Tree')

Selected models: SVM, KNN, Logistic Regression, Gradient Boosting Classifier, Decision Tree
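To see how the tuned models compare before picking, they can be ranked by mean CV test score (a minimal sketch using the tuned_model frame built above):

# rank tuned models by mean CV test score (sketch)
print(tuned_model.sort_values('CVTestScore', ascending = False))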
# 3.7 Stacking dataset
# Each of the 5 selected models is fit on a random 60% subset of train and then predicts
# Survived for the whole training set; the five prediction columns are concatenated into the
# new level-two training set train_stacking. For test_stacking, the same models are refit on
# the full training data and predict the test set. train['Survived'] stays the target
# (train_Survived), and the final stacking model is XGBoost
import random
train_index = random.sample(range(len(train)), round(0.6*len(train)))
train_stacking = np.concatenate(
        (svm_mod.fit(train_data.iloc[train_index],train_Survived.iloc[train_index]).predict(train_data).reshape(-1,1),
        knn_mod.fit(train_data.iloc[train_index],train_Survived.iloc[train_index]).predict(train_data).reshape(-1,1),
        lr_mod.fit(train_data.iloc[train_index],train_Survived.iloc[train_index]).predict(train_data).reshape(-1,1),
        tree_mod.fit(train_data.iloc[train_index],train_Survived.iloc[train_index]).predict(train_data).reshape(-1,1),
        gbm_mod.fit(train_data.iloc[train_index],train_Survived.iloc[train_index]).predict(train_data).reshape(-1,1)),
        axis = 1)
test_stacking = np.concatenate(
        (svm_mod.fit(train_data,train_Survived).predict(test).reshape(-1,1),
        knn_mod.fit(train_data,train_Survived).predict(test).reshape(-1,1),
        lr_mod.fit(train_data,train_Survived).predict(test).reshape(-1,1),
        tree_mod.fit(train_data,train_Survived).predict(test).reshape(-1,1),
        gbm_mod.fit(train_data,train_Survived).predict(test).reshape(-1,1)),
        axis = 1)
print(train_stacking.shape)
print(test_stacking.shape)


(891, 5)
(418, 5)
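Note that the level-one training features above are in-sample: each model is fit on a random 60% subset and then predicts the whole training set. A common variant is to build them out-of-fold, so every row is predicted by a model that never saw it. Below is a minimal sketch of that variant; the fold count and the oof_predictions helper are illustrative assumptions, not part of the original pipeline.

from sklearn.model_selection import KFold

def oof_predictions(model, X, y, n_splits = 5):
    # each row gets a prediction from a model trained on the other folds
    oof = np.zeros(len(X))
    for tr_idx, va_idx in KFold(n_splits = n_splits, shuffle = True, random_state = 13).split(X):
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        oof[va_idx] = model.predict(X.iloc[va_idx])
    return oof.reshape(-1,1)

# illustrative usage only:
# train_stacking_oof = np.concatenate(
#     [oof_predictions(m, train_data, train_Survived) for m in
#      (svm_mod, knn_mod, lr_mod, tree_mod, gbm_mod)], axis = 1)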
# 3.8 stacking model with XGBoost
xgb_mod = xgb.XGBClassifier(
            #learning_rate = 0.02,
            n_estimators= 2000,
            max_depth= 4,
            min_child_weight= 2,
            gamma=0.9,
            subsample=0.8,
            colsample_bytree=0.8,
            objective= 'binary:logistic',
            nthread= -1,
            scale_pos_weight=1).fit(train_stacking, train_Survived)
predictions = xgb_mod.predict(test_stacking)
# 3.9 Submission File
Submission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
Submission.to_csv("Submission.csv", index=False)