Overview: There are 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa. The Kaggle competition challenges you to predict the final price of each home.
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib
def display_df_info(df_name, my_df, v=False):
    """Print a short summary of a dataframe: name, shape, head, and (if v) full info."""
    print("Data: {}".format(df_name))
    print("Data shape: {}".format(my_df.shape))
    print("The first few rows of the dataframe:")
    print(my_df.head())
    if v:
        print("Dataframe Info:")
        my_df.info()  # info() prints directly; wrapping it in print() would also emit 'None'
class GetAge(BaseEstimator, TransformerMixin):
    """Custom transformer: replace YearBuilt with the age of the house in years."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()  # work on a copy so the caller's dataframe is not mutated
        current_year = dt.datetime.now().year  # .year is already an int
        X['YearBuilt'] = current_year - X.YearBuilt
        return X
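# Quick illustrative check of GetAge (this demo frame is hypothetical and not
# part of the competition data): YearBuilt is replaced by the house's age.
demo_df = pd.DataFrame({'YearBuilt': [2000, 1975]})
print(GetAge().transform(demo_df))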
# Load the raw competition data, using the first column (Id) as the index
file_path = 'train (3).csv'
input_data = pd.read_csv(file_path, index_col=0)
display_df_info("Raw imported data", input_data)
# Separate out the outcome variable from the loaded dataframe
output_var_name = 'SalePrice'
output_var = input_data[output_var_name]
input_data.drop(output_var_name, axis=1, inplace=True)
# Subsetting the columns: define features to keep
feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'HouseStyle']
features = input_data[feature_names]
display_df_info('Features before Transform', features, v=True)
# Data pre-processing
# Define the numerical and categorical columns
numerical_features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
categorical_features = ['HouseStyle']
preprocess = make_column_transformer(
    # numerical columns: compute house age, mean-impute missing values, then standardize
    (make_pipeline(GetAge(), SimpleImputer(), StandardScaler()), numerical_features),
    # handle_unknown='ignore' keeps prediction from failing if the test split
    # contains a HouseStyle category not seen during training
    (OneHotEncoder(handle_unknown='ignore'), categorical_features)
)
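# Optional sanity check (illustrative extra, not part of the original
# workflow): fit the preprocessor on its own to see how many columns come out
# after scaling and one-hot encoding. GridSearchCV re-fits everything later.
print("Transformed feature matrix shape: {}".format(preprocess.fit_transform(features).shape))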
# Combine pre-processing with the ML algorithm
pipeline = make_pipeline(
    preprocess,
    KNeighborsRegressor())
# Hyperparameter grid: keys follow make_pipeline's naming convention of
# <lowercased step name>__<parameter>
params = {
    'kneighborsregressor__n_neighbors': range(2, 21),
    'kneighborsregressor__weights': ['uniform', 'distance'],
}
model = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_squared_error')
# Train/test split
x_train, x_test, y_train, y_test = train_test_split(features, output_var, test_size=0.3, random_state=42)
# Train the pipeline (GridSearchCV fits the full pipeline for every parameter combination)
model.fit(x_train, y_train)
# Print the best KNN parameters found by the grid search
print("Best Parameters chosen: {}".format(model.best_params_))
# Scoring and evaluation of the model on the held-out test set
pred_test = model.predict(x_test)
# Display the metrics
rmse = np.sqrt(mean_squared_error(y_test, pred_test))
r2 = r2_score(y_test, pred_test)
print("Results on Test Data")
print("###############################")
print("RMSE: {:.2f}".format(rmse))
print("R2 Score: {:.5f}".format(r2))
# Compare actual vs predicted values
compare = pd.DataFrame({
    'Actual': y_test,
    'Predicted': pred_test,
    'Difference': y_test - pred_test
})
display_df_info('Actual vs Predicted Comparison', compare)
# Save the trained model to disk
with open('my_model_knn.joblib', 'wb') as fo:
    joblib.dump(model, fo)
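# Illustrative round-trip (assumption, not in the original script): reload the
# saved model and predict on a few held-out rows to confirm it deserialized
# correctly.
loaded_model = joblib.load('my_model_knn.joblib')
print(loaded_model.predict(x_test.head()))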