Building a KNN Regression Model Template on the Kaggle Ames Housing Dataset

Overview: There are 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa. The Kaggle competition challenges you to predict the final price of each home.

Exploratory Data Analysis

Importing necessary libraries

In [1]:
import numpy as np 
import pandas as pd 
import datetime as d

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import joblib

Functions and classes

In [2]:
def display_df_info(df_name, my_df, v=False):
    """Print a dataframe's name, shape and head; with v=True (verbose), also its info()."""
    print("Data:{}".format(df_name))
    print("Data shape:{}".format(my_df.shape))
    print("The first few rows of the dataframe:")
    print(my_df.head())

    if v:
        print("Dataframe Info:")
        print(my_df.info())
In [3]:
class GetAge(BaseEstimator, TransformerMixin):
    """Custom transformer: convert YearBuilt into the age of the house in years."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()  # work on a copy so the caller's dataframe is not mutated
        current_year = d.datetime.now().year
        X['YearBuilt'] = current_year - X['YearBuilt']
        return X
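A quick way to see what the transformer does, using a toy two-row frame with made-up years (a sanity check, not part of the original run):

demo = pd.DataFrame({'YearBuilt': [2003, 1976]})
print(GetAge().fit_transform(demo))  # the YearBuilt column now holds ages in years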
In [4]:
file_path = 'train (3).csv'
input_data = pd.read_csv(file_path, index_col=0)
display_df_info("Raw imported data", input_data)
Data:Raw imported data
Data shape:(1460, 80)
The first few rows of the dataframe:
    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                    
1           60       RL         65.0     8450   Pave   NaN      Reg   
2           20       RL         80.0     9600   Pave   NaN      Reg   
3           60       RL         68.0    11250   Pave   NaN      IR1   
4           70       RL         60.0     9550   Pave   NaN      IR1   
5           60       RL         84.0    14260   Pave   NaN      IR1   

   LandContour Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature  \
Id                                  ...                                     
1          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
2          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   
3          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
4          Lvl    AllPub    Corner  ...        0    NaN   NaN         NaN   
5          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   

   MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
Id                                                             
1        0      2    2008        WD         Normal     208500  
2        0      5    2007        WD         Normal     181500  
3        0      9    2008        WD         Normal     223500  
4        0      2    2006        WD        Abnorml     140000  
5        0     12    2008        WD         Normal     250000  

[5 rows x 80 columns]
In [5]:
# Separate out the target variable from the loaded dataframe
output_var_name = 'SalePrice'
output_var = input_data[output_var_name]
input_data.drop(output_var_name, axis=1, inplace=True)
In [6]:
# Subsetting the columns: define features to keep
feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'HouseStyle']
features = input_data[feature_names]
display_df_info('Features before Transform', features, v=True)
Data:Features before Transform
Data shape:(1460, 8)
The first few rows of the dataframe:
    LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
Id                                                                   
1      8450       2003       856       854         2             3   
2      9600       1976      1262         0         2             3   
3     11250       2001       920       866         2             3   
4      9550       1915       961       756         1             3   
5     14260       2000      1145      1053         2             4   

    TotRmsAbvGrd HouseStyle  
Id                           
1              8     2Story  
2              6     1Story  
3              6     2Story  
4              7     2Story  
5              9     2Story  
Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 8 columns):
LotArea         1460 non-null int64
YearBuilt       1460 non-null int64
1stFlrSF        1460 non-null int64
2ndFlrSF        1460 non-null int64
FullBath        1460 non-null int64
BedroomAbvGr    1460 non-null int64
TotRmsAbvGrd    1460 non-null int64
HouseStyle      1460 non-null object
dtypes: int64(7), object(1)
memory usage: 102.7+ KB
None

Data Pre-processing

In [7]:
# Define the numerical and categorical features
numerical_features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

categorical_features = ['HouseStyle']
In [8]:
preprocess = make_column_transformer(
    (make_pipeline(GetAge(), SimpleImputer(), StandardScaler()), numerical_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features)  # tolerate categories unseen during fit
)
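Before wiring this into the full pipeline, it can help to verify what the preprocessing produces on its own (a hypothetical check, not part of the original run):

transformed = preprocess.fit_transform(features)
print(transformed.shape)  # 7 scaled numeric columns plus one indicator column per HouseStyle level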
In [9]:
# Combine pre-processing with ML algorithm
pipeline = make_pipeline(
    preprocess,
    KNeighborsRegressor())

params = {'kneighborsregressor__n_neighbors': range(2, 21),
          'kneighborsregressor__weights': ['uniform', 'distance']}
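Note that make_pipeline names each step after its lowercased class name, which is where the kneighborsregressor__ prefix in the parameter grid comes from. The step names can be listed directly:

print(list(pipeline.named_steps))  # ['columntransformer', 'kneighborsregressor']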
In [10]:
model = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_squared_error') 

Model Training

In [11]:
#Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(
    features, output_var, test_size=0.3, random_state=42)
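A quick shape check on the split; 30% of the 1460 rows gives 438 test rows, which matches the comparison table further down:

print(x_train.shape, x_test.shape)  # expect (1022, 8) and (438, 8)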
In [12]:
# Train the pipeline
model.fit(x_train, y_train)
Out[12]:
GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('getage',
                                                                                          GetAge()),
                                                                                         ('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                                                                                        fill_value=None,
                                                                                                        missing_value...
                                                            metric='minkowski',
                                                            metric_params=None,
                                                            n_jobs=None,
                                                            n_neighbors=5, p=2,
                                                            weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'kneighborsregressor__n_neighbors': range(2, 21),
                         'kneighborsregressor__weights': ['uniform',
                                                          'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)
In [13]:
# Print the best KNN parameters chosen by the grid search
print("Best Parameters chosen:{}".format(model.best_params_))
Best Parameters chosen:{'kneighborsregressor__n_neighbors': 9, 'kneighborsregressor__weights': 'distance'}
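Beyond the single winning pair, cv_results_ holds the scores for the whole grid. A sketch for ranking candidates by cross-validated RMSE (negating the sign of the neg_mean_squared_error scorer):

cv_df = pd.DataFrame(model.cv_results_)
cv_df['cv_rmse'] = np.sqrt(-cv_df['mean_test_score'])  # neg MSE back to RMSE
cols = ['param_kneighborsregressor__n_neighbors',
        'param_kneighborsregressor__weights', 'cv_rmse']
print(cv_df[cols].sort_values('cv_rmse').head())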
In [14]:
# Score and evaluate the model on the held-out test set

pred_test = model.predict(x_test)
In [15]:
# Compute the evaluation metrics

rmse = np.sqrt(mean_squared_error(y_test, pred_test))
In [16]:
r2 = r2_score(y_test, pred_test)
In [17]:
print("Results on test Data")
print("###############################")
print("RMSE:{:.2f}".format(rmse))
print("R2 Score: {:.5f}".format(r2))
Results on test Data
###############################
RMSE:35308.51
R2 Score: 0.82134
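For context, the same metric can be computed for a naive baseline that always predicts the mean training price (a quick sketch, not in the original notebook):

baseline_pred = np.full(len(y_test), y_train.mean())
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))
print("Baseline RMSE: {:.2f}".format(baseline_rmse))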
In [18]:
#Compare actual vs predicted values

compare = pd.DataFrame({
    'Actual': y_test,
    'Predicted': pred_test,
    'Difference': y_test - pred_test
})

display_df_info('Actual vs Predicted Comparison', compare)
Data:Actual vs Predicted Comparison
Data shape:(438, 3)
The first few rows of the dataframe:
      Actual      Predicted    Difference
Id                                       
893   154500  138620.491150  15879.508850
1106  325000  299936.109221  25063.890779
414   115000  114889.267404    110.732596
523   159000  131016.817022  27983.182978
1037  315500  304926.294370  10573.705630
In [19]:
#Save the model

joblib.dump(model, 'my_model_knn.joblib')
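To confirm the saved artifact round-trips, it can be reloaded and used for prediction (a minimal check):

loaded = joblib.load('my_model_knn.joblib')
print(loaded.predict(x_test.head()))  # should reproduce the first five test predictions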