Regularized Linear Regression with Model Stacking

  Demo in House Price Data

Posted by Haby on October 10, 2017

Part 1 : Analysis Preparation

# import packages

# basic packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random

# general packages
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# model packages
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost
from sklearn.model_selection import train_test_split

# model tuning packages
from sklearn.model_selection import GridSearchCV

# CV folds
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
# load train/test data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
df = pd.concat((train, test)).reset_index(drop=True)

# basic info
print('Shape of Train: ',train.shape)
print('-'*40)
print('Shape of Test: ',test.shape)
print('-'*40)
print(train.columns)
print('-'*40)
print(test.columns)
Shape of Train:  (1460, 81)
----------------------------------------
Shape of Test:  (1459, 80)
----------------------------------------
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')
----------------------------------------
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')

The target is SalePrice, so we analyze it first. The distribution does not look normal: it is right-skewed, with a long right tail. I will compute skewness and kurtosis to confirm this, and if confirmed, I will transform the data for further analysis.

# saleprice
g = sns.distplot(train.SalePrice,color = 'g')
g.set_title('Sale Price Distribution')
plt.show()

[figure: Sale Price Distribution]

# scale sale price
df['SalePrice'] = np.log1p(df['SalePrice'])

# plot and check
g1 = sns.distplot(df[:1460].SalePrice,color = 'g')
g1.set_title('Scaled Sale Price Distribution')

plt.show()

[figure: Scaled Sale Price Distribution]

# skewness and kurtosis
print('Skewness is: ', train.SalePrice.skew())
print('-'*40)
print('Kurtosis is: ',train.SalePrice.kurt())
Skewness is:  1.88287575977
----------------------------------------
Kurtosis is:  6.53628186006
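
The raw target is clearly right-skewed and heavy-tailed, which confirms the need for the log transform applied above. For completeness, the same check can be repeated on the transformed target (a minimal sketch; this output is not part of the original run):

# skewness and kurtosis after the log1p transform (train rows only)
print('Skewness after log1p: ', df[:1460].SalePrice.skew())
print('Kurtosis after log1p: ', df[:1460].SalePrice.kurt())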

Part 2 : EDA

  1. General Visualization
# Use a correlation matrix to analyze the numeric data first
_,ax = plt.subplots(figsize=(14,12))
sns.heatmap(train.drop('Id',axis = 1).corr(),cmap = 'Greens')
ax.set_title('SalePrice vs All Numeric Variables \n The darker the color, the stronger the correlation.')
plt.show()

[figure: correlation heatmap, SalePrice vs all numeric variables]

# list relationship of all numeric variables with SalePrice
_,ax = plt.subplots(figsize = (14,12))
g = train.drop(['Id'],axis = 1).corr()['SalePrice'].sort_values().plot.barh(color = 'g')
ax.set_title('Correlation of All Numeric Variables with SalePrice')

# add annotation
for p in g.patches:
    g.annotate(str(round(p.get_width(),2)), (p.get_width() * 1.01,p.get_y()))

plt.show()

[figure: correlation of each numeric variable with SalePrice]

2. Imputing Missing Values

# plot all missing values
_,ax = plt.subplots(1,2,figsize = (14,8))
g1 = df.isnull().sum().sort_values(ascending = False).head(30).plot.barh(ax = ax[0],color = 'g')
g2 = train.isnull().sum().sort_values(ascending = False).head(20).plot.barh(ax = ax[1],color = 'g')
g1.set_title('Missing Values for All Data')
g2.set_title('Missing Values for Train')

# add annotation
for p in g1.patches:
    g1.annotate(str(round(p.get_width(),2)), (p.get_width() * 1.01,p.get_y()))

plt.show()

[figure: missing-value counts, all data vs train]

# drop columns with more than 600 missing values: 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'
df = df.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'],axis = 1)
print(df.shape)
(2919, 76)
  • Fill NAs in LotFrontage using the neighborhood, since houses in the same neighborhood should have a similar distance to the road.

  • Garage group : 157 houses have no garage info at all, so I assume they have no garage; 2 of them do have GarageType info, so I will try to impute those (a sketch for locating them follows this list).

  • Basement group : 79 houses have no basement info, so I assume they have no basement, and impute the others.

  • MasVnr group : fill 23 of them with None and impute the last one.

  • Impute the remaining individual values.
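
The two partially-documented garage houses mentioned above can be located before imputing, for example (a minimal sketch; the printed columns are just for inspection):

# houses with a GarageType but no other garage information
partial = df[df['GarageType'].notnull() & df['GarageYrBlt'].isnull()]
print(partial[['Id', 'GarageType', 'GarageYrBlt', 'GarageCars', 'GarageArea']])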

# LotFrontage : impute with the neighborhood median
df['LotFrontage'] = df.groupby(['Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    df[col] = df[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    df[col] = df[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    df[col] = df[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    df[col] = df[col].fillna('None')
df["MasVnrType"] = df["MasVnrType"].fillna("None")
df["MasVnrArea"] = df["MasVnrArea"].fillna(0)
# MSZoning
print(df['MSZoning'].describe()) # top is RL
df['MSZoning'] = df['MSZoning'].fillna('RL')
count     2915
unique       5
top         RL
freq      2265
Name: MSZoning, dtype: object
# Utilities
df['Utilities'].value_counts() # 2916 AllPub, 1 NoSeWa and 2 NAs: nearly constant, so drop this variable
df = df.drop('Utilities',axis = 1)
# Functional
print(df['Functional'].value_counts()) # top is Typ
df['Functional'] = df['Functional'].fillna('Typ')
Typ     2717
Min2      70
Min1      65
Mod       35
Maj1      19
Maj2       9
Sev        2
Name: Functional, dtype: int64
# Exterior1st / Exterior2nd
print(df['Exterior1st'].value_counts()) # top is VinylSd; ImStucc has only 1 row, so its dummy can be dropped after get_dummies
print(df['Exterior2nd'].value_counts()) # top is VinylSd; Other has only 1 row, so its dummy can be dropped after get_dummies
df['Exterior1st'] = df['Exterior1st'].fillna('VinylSd')
df['Exterior2nd'] = df['Exterior2nd'].fillna('VinylSd')
VinylSd    1025
MetalSd     450
HdBoard     442
Wd Sdng     411
Plywood     221
CemntBd     126
BrkFace      87
WdShing      56
AsbShng      44
Stucco       43
BrkComm       6
Stone         2
AsphShn       2
CBlock        2
ImStucc       1
Name: Exterior1st, dtype: int64
VinylSd    1014
MetalSd     447
HdBoard     406
Wd Sdng     391
Plywood     270
CmentBd     126
Wd Shng      81
BrkFace      47
Stucco       47
AsbShng      38
Brk Cmn      22
ImStucc      15
Stone         6
AsphShn       4
CBlock        3
Other         1
Name: Exterior2nd, dtype: int64
# KitchenQual
print(df['KitchenQual'].value_counts())
df['KitchenQual'] = df['KitchenQual'].fillna('TA')
TA    1492
Gd    1151
Ex     205
Fa      70
Name: KitchenQual, dtype: int64
# SaleType
print(df['SaleType'].value_counts())
df['SaleType'] = df['SaleType'].fillna('WD')
WD       2525
New       239
COD        87
ConLD      26
CWD        12
ConLI       9
ConLw       8
Oth         7
Con         5
Name: SaleType, dtype: int64
# Electrical
print(df['Electrical'].value_counts()) # top is SBrkr; Mix has only 1 row, so its dummy can be dropped after get_dummies
df['Electrical'] = df['Electrical'].fillna('SBrkr')
SBrkr    2671
FuseA     188
FuseF      50
FuseP       8
Mix         1
Name: Electrical, dtype: int64
_,ax = plt.subplots(figsize = (8,6))
g1 = df.isnull().sum().sort_values(ascending = False).head(30).plot.barh(color = 'g')
g1.set_title('Missing Values for All Data')
for p in g1.patches:
    g1.annotate(str(round(p.get_width(),2)), (p.get_width() * 1.01,p.get_y()))   
plt.show()

[figure: remaining missing values after imputation]

3. Outliers

_,ax = plt.subplots(3,2,figsize = (14,14))
sns.regplot(x = df.GrLivArea,y = df.SalePrice,ax = ax[0,0])
sns.regplot(x = df.TotalBsmtSF,y = df.SalePrice,ax = ax[0,1])
sns.boxplot(x = df.OverallCond,y = df.SalePrice,ax = ax[1,0])
sns.boxplot(x = df.OverallQual,y = df.SalePrice,ax = ax[1,1])
sns.regplot(x = df['1stFlrSF'],y = df.SalePrice,ax = ax[2,0])
sns.regplot(x = df['2ndFlrSF'],y = df.SalePrice, ax = ax[2,1])
  <matplotlib.axes._subplots.AxesSubplot at 0x7fd57be72390>

[figure: GrLivArea / TotalBsmtSF / OverallCond / OverallQual / 1stFlrSF / 2ndFlrSF vs SalePrice]

  • Two outliers in GrLivArea: they sit far from the fitted line (a sketch for pulling these rows out follows this list).
  • One outlier in TotalBsmtSF. If this is the same house as an outlier in GrLivArea, it can be read as a very large house in a bad location, so its price is lower than usual.
  • One outlier at OverallCond = 2 that does not show up in OverallQual: a house with low OverallCond but high OverallQual does not make sense given how the two relate, so treat it as an outlier.
  • One outlier in 1stFlrSF; I suspect it is the same house. Delete it if so.
  • 2ndFlrSF does not fit very well.
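
As promised in the list above, the suspect GrLivArea rows can be pulled out explicitly before deciding to drop them (a minimal sketch; the thresholds are read off the plots, and SalePrice is already on the log scale here):

# unusually large living area but low (log) sale price -> candidate outliers
candidates = df[(df.GrLivArea > 4500) & (df.SalePrice < 12.5)]
print(candidates[['Id', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'SalePrice']])
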
_,ax = plt.subplots(2,2,figsize = (14,10))
sns.boxplot(y = df.GarageArea,x = df.GarageCars,ax = ax[1,0])
sns.regplot(x = df['GarageArea'],y = df.SalePrice, ax = ax[0,1])
sns.boxplot(y = df.SalePrice,x = df.GarageCars,ax = ax[0,0])
sns.regplot(x = df['GarageArea']/df.GarageCars,y = df.SalePrice, ax = ax[1,1])

ax[0,0].set_title('GarageCars vs SalePrice')
ax[0,1].set_title('GarageArea vs SalePrice')
ax[1,0].set_title('GarageCars vs GarageArea')
ax[1,1].set_title('GarageArea per Car vs SalePrice')
Text(0.5,1,'GarageArea per Car vs SalePrice')

[figure: garage variables vs SalePrice and each other]

  • At GarageCars = 4 the sale price suddenly drops. The test set also contains GarageCars = 5 houses, which never appear in train.
  • There are 4 extreme points in the GarageArea plot.
  • GarageArea and GarageCars are strongly related to each other.
  • Most per-car areas fall between 150 and 450 square feet, though some houses have a much larger area per car.
# drop outliers from GrLivArea and TotalBsmtSF plot
df = df.drop(df[df['Id'] == 1299].index)
df = df.drop(df[df['Id'] == 524].index)
# watch out: there is one extreme point in the test data too.

4. Scale Data

scaled = ["1stFlrSF","2ndFlrSF","3SsnPorch","BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","EnclosedPorch",
          "GarageArea","GrLivArea","LotArea","LotFrontage","LowQualFinSF","MasVnrArea","MiscVal",
          "OpenPorchSF","PoolArea","ScreenPorch","TotalBsmtSF", "WoodDeckSF"]
# check skewness and kurtosis of the variables selected for scaling
def skew_kur(df) :
    # keep only the columns marked for scaling
    num = [c for c in df.columns if c in scaled]
    # build a summary frame of skewness and kurtosis per variable
    return pd.DataFrame({'Variable' : num,
                         'skewness' : [df[c].skew() for c in num],
                         'kurtosis' : [df[c].kurtosis() for c in num]})

norm = skew_kur(df)
skew_kur(df)
Variable kurtosis skewness
0 1stFlrSF 5.075293 1.257933
1 2ndFlrSF -0.424185 0.861999
2 3SsnPorch 149.304586 11.377932
3 BsmtFinSF1 1.427134 0.981149
4 BsmtFinSF2 18.828682 4.146636
5 BsmtUnfSF 0.403042 0.920161
6 EnclosedPorch 28.358039 4.004404
7 GarageArea 0.864865 0.216968
8 GrLivArea 2.456625 1.069300
9 LotArea 275.639934 13.116240
10 LotFrontage 8.526991 1.103332
11 LowQualFinSF 174.810242 12.090757
12 MasVnrArea 9.457156 2.623068
13 MiscVal 563.687542 21.950962
14 OpenPorchSF 11.021266 2.530660
15 PoolArea 327.027992 17.697766
16 ScreenPorch 17.761714 3.947131
17 TotalBsmtSF 3.711566 0.672097
18 WoodDeckSF 6.750566 1.845741
# treat skewness in [-0.8, 0.8] and kurtosis in [-3.0, 3.0] as the acceptable ranges; log-transform variables outside them
temp = norm[(abs(norm['kurtosis']) > 3) | (abs(norm['skewness']) > 0.8)]['Variable']

df[temp] = df[temp].apply(lambda x: np.log1p(x))
# double check
skew_kur(df)
# a few variables are still extreme (mostly zero-inflated ones such as PoolArea), but overall much better
Variable kurtosis skewness
0 1stFlrSF 0.043907 0.030374
1 2ndFlrSF -1.886414 0.306786
2 3SsnPorch 76.527700 8.826656
3 BsmtFinSF1 -1.468428 -0.616808
4 BsmtFinSF2 4.248768 2.462526
5 BsmtUnfSF 3.953316 -2.155250
6 EnclosedPorch 1.970906 1.960960
7 GarageArea 0.864865 0.216968
8 GrLivArea 0.103405 -0.022062
9 LotArea 3.751081 -0.532920
10 LotFrontage 2.744406 -1.069416
11 LowQualFinSF 72.258633 8.559041
12 MasVnrArea -1.584435 0.538731
13 MiscVal 25.968860 5.214687
14 OpenPorchSF -1.773013 -0.041559
15 PoolArea 243.656301 15.631314
16 ScreenPorch 6.755490 2.946085
17 TotalBsmtSF 25.569296 -4.966774
18 WoodDeckSF -1.892872 0.159605

5. Add Features

  • whether the house has a pool
  • whether the house has a garage
  • year / month sold effects
  • neighborhood class
# HasPool : if PoolArea != 0 then HasPool = 1
# use .loc (not chained indexing) so the assignment actually modifies df
df['HasPool'] = 0
df.loc[df.PoolArea != 0, 'HasPool'] = 1

# HasGarage : if GarageYrBlt == 0 then HasGarage = 0
df['HasGarage'] = 0
df.loc[df.GarageYrBlt != 0, 'HasGarage'] = 1
# Month
_,ax = plt.subplots(1,2,figsize = (14,6))
sns.countplot(df.MoSold,ax =ax[0])
sns.boxplot(x = df.MoSold,y = df.SalePrice,ax = ax[1])
<matplotlib.axes._subplots.AxesSubplot at 0x7fd5d5a92f60>

[figure: MoSold counts and MoSold vs SalePrice]

# Built year
_,ax = plt.subplots(3,1,figsize = (14,10))
sns.regplot(x = df.YearBuilt.astype(int),y = df.SalePrice,ax = ax[0])
sns.boxplot(x = df.YearBuilt.astype(int),y = df.SalePrice,ax =ax[1])
sns.countplot(df.YearBuilt.astype(int),ax =ax[2])
<matplotlib.axes._subplots.AxesSubplot at 0x7fd5d57f7d68>

[figure: YearBuilt vs SalePrice, distribution and counts]

_,ax = plt.subplots(1,2,figsize = (14,6))
sns.countplot(df.YrSold,ax = ax[0])
sns.regplot(x = df.YrSold.astype(int),y = df.SalePrice,ax = ax[1] )
<matplotlib.axes._subplots.AxesSubplot at 0x7fd57ba0d128>

[figure: YrSold counts and YrSold vs SalePrice]

  • More houses sell in May, June and July, so mark those months as 'HotMon'; prices in those months are about the same.
  • The newer the house, the more expensive it is, and more houses were built after 2000.
  • The number of houses sold was roughly the same in each year, while the sale-price trend decreased.
# Hot Mon : May, June and July
df['HotMon'] = 0
df.loc[df.MoSold.isin([5, 6, 7]), 'HotMon'] = 1  # note: 5|6|7 is a bitwise OR, not a list of months

# HouseAge : YrSold - YearBuilt
df['HouseAge'] = df.YrSold.astype(int) - df.YearBuilt.astype(int)
# Neighborhood
_,ax = plt.subplots(2,1,figsize = (14,8))
sns.countplot(df.Neighborhood,ax = ax[0])
sns.boxplot(x = df.Neighborhood,y = df.SalePrice,ax = ax[1])
# treat neighborhoods with mean (log) SalePrice > 12.5 as 'ClassI' and the rest as others
print(df.groupby('Neighborhood')['SalePrice'].mean().sort_values(ascending = False).head())

# ClassI
df['ClassI'] = 0
df.loc[df.Neighborhood.isin(['NoRidge', 'NridgHt', 'StoneBr']), 'ClassI'] = 1
Neighborhood
NoRidge    12.676003
NridgHt    12.619415
StoneBr    12.585490
Timber     12.363460
Veenker    12.344180
Name: SalePrice, dtype: float64
# TotalArea = basement + 1stFlrSF + 2ndFlrSF (note: these three are already log1p-scaled here)
df['TotalArea'] = df['1stFlrSF'] +df['2ndFlrSF'] +df['TotalBsmtSF']

# Street to 0/1
df.Street.replace(['Grvl', 'Pave'],[0,1],inplace = True)

6. Get Dummies

df_dummy = pd.get_dummies(df)
print(df_dummy.shape)
(2917, 283)
# drop the single-row dummy columns noted earlier:
# Exterior1st : ImStucc
# Exterior2nd : Other
# Electrical : Mix
df_dummy.drop(['Exterior1st_ImStucc','Exterior2nd_Other','Electrical_Mix'],axis = 1,inplace = True)
df_dummy.shape
(2917, 280)
# separate train and test (both dropped outliers were train rows, hence the -2)
train_dummy = df_dummy[:train.shape[0]-2] # 2 outliers
test_dummy = df_dummy[train.shape[0]-2:]
print(train_dummy.shape,test_dummy.shape)
(1458, 280) (1459, 280)
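
Slicing by position works only because both dropped outliers were train rows. A position-independent alternative (a sketch) splits on the fact that exactly the test rows are missing SalePrice:

# test rows are exactly those with no SalePrice
train_dummy = df_dummy[df_dummy['SalePrice'].notnull()]
test_dummy = df_dummy[df_dummy['SalePrice'].isnull()]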

Part 3: Modeling

Level-one models I want to use:

 ----------------------------------------
   Linear
   LASSO
   Ridge
   Elastic Net
   Random Forest
   Gradient Boosting Regressor
   XGBoost
 ----------------------------------------
# set up target / data datasets for modeling
train_Price = train_dummy['SalePrice'] # target
train_data = train_dummy.drop(['SalePrice','Id'],axis = 1) # data
print(train_Price.shape,train_data.shape)
(1458,) (1458, 278)
# train test split for scoring
x_train,x_test,y_train,y_test = train_test_split(train_data,train_Price,
                                               test_size = 0.25,random_state = 13)
print(x_train.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1093 entries, 669 to 338
Columns: 278 entries, 1stFlrSF to SaleType_WD
dtypes: float64(24), int64(19), uint8(235)
memory usage: 626.6 KB
None
# set up a dataframe to compare RMSE across models
models = pd.DataFrame({
    'Model': ['Linear', 'Lasso', 'Ridge',
              'Elastic Net', 'Gradient Boosting', 'Random Forest',
              'Xgboost'],
    'Result': [ np.sqrt(mean_squared_error(LinearRegression().fit(x_train,y_train).predict(x_test),y_test)),
              np.sqrt(mean_squared_error(Lasso().fit(x_train,y_train).predict(x_test),y_test)),
              np.sqrt(mean_squared_error(Ridge().fit(x_train,y_train).predict(x_test),y_test)),
              np.sqrt(mean_squared_error(ElasticNet().fit(x_train,y_train).predict(x_test),y_test)),
              np.sqrt(mean_squared_error(GradientBoostingRegressor().fit(x_train,y_train).predict(x_test),y_test)),
              np.sqrt(mean_squared_error(RandomForestRegressor().fit(x_train,y_train).predict(x_test),y_test)),
              np.sqrt(mean_squared_error(xgboost.XGBRegressor().fit(x_train,y_train).predict(x_test),y_test))
              ]})
print(models.sort_values(by='Result'))
g = models.plot(kind = 'bar',title = 'Models RMSE Score \n The Lower the Better')
g.set_xlabel(list(models['Model']))
               Model    Result
2              Ridge  0.119638
0             Linear  0.125658
4  Gradient Boosting  0.136083
6            Xgboost  0.139107
5      Random Forest  0.164620
3        Elastic Net  0.253736
1              Lasso  0.258744





Text(0.5,0,"['Linear', 'Lasso', 'Ridge', 'Elastic Net', 'Gradient Boosting', 'Random Forest', 'Xgboost']")

[figure: model RMSE comparison]

  • RMSE is not as good as I expected; use CV folds with GridSearchCV for a better result. (Since the target is log-scaled, an RMSE of about 0.12 corresponds to a typical relative price error of roughly 12%.)
  • L2 regularization does better than L1 here (with default penalties).
  • Hyper-parameter tuning should give better results and feed the model ensemble.
# split cv folder
cv_split = ShuffleSplit(n_splits = 10, test_size = .3,
                                        train_size = .6, random_state = 13)

# models to test
mod = [LinearRegression(),Lasso(),Ridge(),ElasticNet(),GradientBoostingRegressor(),RandomForestRegressor(),xgboost.XGBRegressor()]

# Cross validation
cv_score = list()
for model in mod :
    cv_result = np.sqrt(-cross_val_score(model, train_data, train_Price, cv = cv_split, scoring = 'neg_mean_squared_error'))
    cv_score.append(cv_result.mean())

cv_model = pd.DataFrame({
        'Model': ['Linear', 'Lasso', 'Ridge',
              'Elastic Net', 'Gradient Boosting', 'Random Forest',
              'Xgboost'],
        'CV Result' :cv_score})
print(cv_model.sort_values(by='CV Result'))
   CV Result              Model
2   0.119145              Ridge
0   0.129010             Linear
6   0.130008            Xgboost
4   0.130840  Gradient Boosting
5   0.153474      Random Forest
3   0.263963        Elastic Net
1   0.268069              Lasso
  • Hyper-parameter tuning
# Lasso
Lasso().get_params()
param_grid = {'alpha' : [0.0005,0.005,0.05,0.2,0.4],
              'tol': [0.1,0.01,0.001, 0.0001],
              'random_state' : [13]}
tuned_lasso = GridSearchCV(Lasso(),param_grid=param_grid, scoring = 'neg_mean_squared_error', cv = cv_split)
tuned_lasso.fit(train_data,train_Price)
print("Best Hyper Parameters:\n",tuned_lasso.best_params_)
Best Hyper Parameters:
 {'alpha': 0.0005, 'random_state': 13, 'tol': 0.01}
# Ridge
Ridge().get_params()
param_grid = {'alpha' : [0.005,0.05,0.2,0.4,0.6,1],
              'tol': [0.1,0.01,0.001, 0.0001],
              'random_state' : [13]}
tuned_ridge = GridSearchCV(Ridge(),param_grid=param_grid, scoring = 'neg_mean_squared_error', cv = cv_split)
tuned_ridge.fit(train_data,train_Price)
print("Best Hyper Parameters:\n",tuned_ridge.best_params_)
Best Hyper Parameters:
 {'alpha': 1, 'random_state': 13, 'tol': 0.1}
# ElasticNet
ElasticNet().get_params()
param_grid = {'alpha' : [0.1,1],
              'l1_ratio' :[0,0.4,0.7,1],
              'tol': [0.1,0.01],
              'random_state' : [13]}
tuned_elnet = GridSearchCV(ElasticNet(),param_grid=param_grid, scoring = 'neg_mean_squared_error', cv = cv_split)
tuned_elnet.fit(train_data,train_Price)
print("Best Hyper Parameters:\n",tuned_ridge.best_params_)
Best Hyper Parameters:
 {'alpha': 1, 'random_state': 13, 'tol': 0.1}
# GradientBoostingRegressor
GradientBoostingRegressor().get_params()
param_grid = {'learning_rate': [0.01],
              'min_samples_split':[5],
              'min_samples_leaf':[5],
              'max_depth':[3],
              'max_features':['sqrt'],
              'subsample':[.5,],
              'n_estimators':[3000,4000],
              'random_state':[13]}
tuned_gbm = GridSearchCV(GradientBoostingRegressor(),param_grid=param_grid, scoring = 'neg_mean_squared_error', cv = cv_split)
tuned_gbm.fit(train_data,train_Price)
print("Best Hyper Parameters:\n",tuned_gbm.best_params_)
Best Hyper Parameters:
 {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 4000, 'random_state': 13, 'subsample': 0.5}
# RandomForestRegressor
RandomForestRegressor().get_params()
param_grid = {'n_estimators': [2000,3000],
              'max_features': ['sqrt'],
              'criterion' : ['mse'],
              'n_jobs' : [-1],
              'random_state' : [13]}
tuned_rf = GridSearchCV(RandomForestRegressor(),param_grid=param_grid, scoring = 'neg_mean_squared_error', cv = cv_split)
tuned_rf.fit(train_data,train_Price)
print("Best Hyper Parameters:\n",tuned_rf.best_params_)
Best Hyper Parameters:
 {'criterion': 'mse', 'max_features': 'sqrt', 'n_estimators': 3000, 'n_jobs': -1, 'random_state': 13}
# xgboost.XGBRegressor
xgboost.XGBRegressor().get_params()
param_grid = {"colsample_bytree" : [0.4],
              "gamma" : [0.4],
              "learning_rate" : [0.05],
              "max_depth" : [5],              
              "min_child_weight":[2],
              "n_estimators" : [2500],
              "subsample" : [0.6],
              "random_state" :[13],
              "nthread" : [-1]}
tuned_xgb = GridSearchCV(xgboost.XGBRegressor(),param_grid=param_grid, scoring = 'neg_mean_squared_error', cv = cv_split)
tuned_xgb.fit(train_data,train_Price)
print("Best Hyper Parameters:\n",tuned_xgb.best_params_)
Best Hyper Parameters:
 {'colsample_bytree': 0.4, 'gamma': 0.4, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 2500, 'nthread': -1, 'random_state': 13, 'subsample': 0.6}
# tuned models to test
ln_mod = LinearRegression()
las_mod = Lasso(alpha = 0.0005,tol = 0.01,random_state = 13)
rid_mod = Ridge(alpha = 0.1,tol = 0.1,random_state = 13)
enet_mod = ElasticNet(alpha = 0.1,tol = 0.1,random_state = 13)
gbm_mod = GradientBoostingRegressor(max_features = 'sqrt',n_estimators = 4000,random_state = 13,
                                learning_rate = 0.01,max_depth = 3,min_samples_leaf = 5,min_samples_split = 5,subsample = 0.5)
rf_mod = RandomForestRegressor(criterion = 'mse',max_features = 'sqrt',n_estimators = 3000,
                                    min_samples_split = 8,random_state = 13)
xgb_mod = xgboost.XGBRegressor(colsample_bytree = .4,max_depth = 5,learning_rate = 0.05,
                                     n_estimators = 2500,gamma = 0.4,
                                     random_state = 13,subsample = .6, min_child_weight = 2)

mod_tuned = [ln_mod,las_mod,rid_mod,enet_mod,gbm_mod,rf_mod,xgb_mod]

# Cross validation
cv_score = list()
for model in mod_tuned :
    cv_result = np.sqrt(-cross_val_score(model, train_data, train_Price, cv = cv_split, scoring = 'neg_mean_squared_error'))
    cv_score.append(cv_result.mean())

tuned_cv_model = pd.DataFrame({
                                'Model': ['Linear','Lasso', 'Ridge',
                                      'Elastic Net', 'Gradient Boosting', 'Random Forest',
                                      'Xgboost'],
                                'Tuned CV Result' :cv_score})
print(tuned_cv_model)
               Model  Tuned CV Result
0             Linear         0.129010
1              Lasso         0.114394
2              Ridge         0.126004
3        Elastic Net         0.179632
4  Gradient Boosting         0.117798
5      Random Forest         0.148650
6            Xgboost         0.137446
_,ax = plt.subplots(figsize=(8,6))
g = tuned_cv_model.plot(ax = ax)
cv_model.plot(ax = g)
models.plot(ax = g)
g.set_title('Tuned Result vs CV result vs Original Result')
g.set_xlabel(list(models['Model']))
Text(0.5,0,"['Linear', 'Lasso', 'Ridge', 'Elastic Net', 'Gradient Boosting', 'Random Forest', 'Xgboost']")

[figure: tuned vs CV vs original results]

  • The tuned results are much better than the original results.
  • Regularized linear regression does better than the boosting models.
  • The best results come from Lasso and GBM.
  • It is a little puzzling that some CV results are worse than the original results; the CV folds train on only 60% of the data versus 75% for the earlier split, which may explain part of the gap.
  • Before full ensembling / stacking: the lasso / enet / ridge models are very similar, so try a simple weighted combination first.
# Lasso Ridge GBM
# m1 = 0.6*lasso + 0.35*gbm + 0.05*ridge
m1 = 0.6*las_mod.fit(x_train,y_train).predict(x_test) + 0.35*gbm_mod.fit(x_train,y_train).predict(x_test) + 0.05*rid_mod.fit(x_train,y_train).predict(x_test)
r = np.sqrt(mean_squared_error(m1,y_test))
print('Combined Model RMSE is : ',r)
Combined Model RMSE is :  0.113704463697
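
The 0.6 / 0.35 / 0.05 weights were picked by hand. A quick grid over candidate weights, reusing the three models fitted in the cell above, could confirm or improve that choice (a sketch; note that tuning weights on the same held-out fold risks overfitting to it):

# crude search over blend weights (w1 + w2 + w3 = 1)
pred_las = las_mod.predict(x_test)
pred_gbm = gbm_mod.predict(x_test)
pred_rid = rid_mod.predict(x_test)
best_w, best_rmse = None, np.inf
for w1 in np.arange(0, 1.05, 0.05):
    for w2 in np.arange(0, 1.05 - w1, 0.05):
        w3 = max(0.0, 1 - w1 - w2)
        blend = w1*pred_las + w2*pred_gbm + w3*pred_rid
        rmse = np.sqrt(mean_squared_error(y_test, blend))
        if rmse < best_rmse:
            best_w, best_rmse = (round(w1,2), round(w2,2), round(w3,2)), rmse
print('Best weights (lasso, gbm, ridge): ', best_w, ' RMSE: ', best_rmse)
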
  • The simple combined model beats every single model we have (about 5% better).
  • Try a stacking model next.
# combine results from 6 single models
# fit each level-one model on a random 60% subsample of train, then predict on all of train
# (the fitted rows are included in those predictions, so some leakage is accepted here)
train_index = random.sample(range(len(train_data)), round(0.6*len(train_data)))
t_train_stacking = np.vstack(
    (las_mod.fit(train_data.iloc[train_index],train_Price.iloc[train_index]).predict(train_data),
     rid_mod.fit(train_data.iloc[train_index],train_Price.iloc[train_index]).predict(train_data),
     enet_mod.fit(train_data.iloc[train_index],train_Price.iloc[train_index]).predict(train_data),
     gbm_mod.fit(train_data.iloc[train_index],train_Price.iloc[train_index]).predict(train_data),
     rf_mod.fit(train_data.iloc[train_index],train_Price.iloc[train_index]).predict(train_data),
     xgb_mod.fit(train_data.iloc[train_index],train_Price.iloc[train_index]).predict(train_data)))
train_stacking = pd.DataFrame(t_train_stacking.T)
train_stacking.shape
(1458, 6)
# split into train and test part
x_train_stacking,x_test_stacking,y_train_stacking,y_test_stacking = train_test_split(train_stacking,train_Price,
                                                                                    test_size = .3,random_state = 13)
# using xgb as the Level 2 model (defaults assumed for the level-2 learner)
l2_mod = xgboost.XGBRegressor()
l2_mod.fit(x_train_stacking,y_train_stacking)
predictions = l2_mod.predict(x_test_stacking)
print('Level 2 Modeling RMSE Score:',np.sqrt(mean_squared_error(predictions,y_test_stacking)))
Level 2 Modeling RMSE Score: 0.0938204230879
  • The stacking model result is better than any single model (around 20%).

  • For further analysis:

  • 1. Convert all numeric variables to dummy variables (1/0).

  • 2. Do more thorough hyper-parameter tuning (needs a machine with more cores).

  • 3. Consider other algorithms.

  • 4. Add a level-3 stacking layer.
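
One step the post stops short of is scoring the real test set. Since the target was transformed with np.log1p, predictions must be mapped back with np.expm1 before they make sense as prices. A minimal sketch under the setup above (level-one models fitted as in the stacking cell, l2_mod as the level-2 learner; the submission file name is illustrative):

# build level-one features for the real test set, predict, and undo the log1p transform
test_data = test_dummy.drop(['SalePrice','Id'], axis = 1)
t_test_stacking = np.vstack([m.predict(test_data)
                             for m in (las_mod, rid_mod, enet_mod, gbm_mod, rf_mod, xgb_mod)])
test_stacking = pd.DataFrame(t_test_stacking.T)
final_pred = np.expm1(l2_mod.predict(test_stacking))
submission = pd.DataFrame({'Id': test_dummy['Id'].astype(int), 'SalePrice': final_pred})
submission.to_csv('submission.csv', index = False)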