There are multiple factors affecting the selling price of HDB resale flats in Singapore. My objective is to construct a regression model to estimate how the selling price of a HDB resale flat changes based on its distance to the CBD and the nearest MRT station, as well as its flat size, floor level and the years of lease remaining on the flat.
To predict selling price of a HDB resale flat with a regression model, based on the explanatory variables given: (1)distance from CBD (2)distance from nearest MRT station (3)flat size (4)floor level (5)years of lease remaining
There is only 1 target output variable, which is the resale price of the HDB which is what we would like to predict. Previously, I have built a Python tool to extract the Longitute and Latitude of the MRT station and CDB region from the onemap.sg API.
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#extract longitude/latitude information based on street name
import json
import requests
import openpyxl
import time
# Open Excel sheet
wb = openpyxl.load_workbook('out.xlsx')
#sheet = wb['out']
sheet = wb.active
sheet['B1'] = 'LONGITUDE'
sheet['C1'] = 'LATITUDE'
for irow in range(2, sheet.max_row+1):
street_name = sheet['A' + str(irow)].value
query_string = 'https://developers.onemap.sg/commonapi/search?searchVal=' + str(
street_name) + '&returnGeom=Y&getAddrDetails=N&pageNum=1'
result = requests.get(query_string)
# Convert JSON into Python Object
data = json.loads(result.content)
if data['results'] == []: ## check if the longitude and latitude can be found, if not, update to NA
sheet['B' + str(irow)].value = 'NA'
sheet['C' + str(irow)].value = 'NA'
else:
# Extract data from JSON
sheet['B' + str(irow)].value = data['results'][0]['LONGITUDE']
sheet['C' + str(irow)].value = data['results'][0]['LATITUDE']
wb.save('Street_Name_List.xlsx')
print('Job Done.')
sheet['B1'] = 'LONGITUDE'
sheet['C1'] = 'LATITUDE'
for irow in range(2, sheet.max_row+1):
street_name = sheet['A' + str(irow)].value
query_string = 'https://developers.onemap.sg/commonapi/search?searchVal=' + str(
street_name) + '&returnGeom=Y&getAddrDetails=N&pageNum=1'
result = requests.get(query_string)
# Convert JSON into Python Object
data = json.loads(result.content)
if data['results'] == []: ## check if the longitude and latitude can be found, if not, update to NA
sheet['B' + str(irow)].value = 'NA'
sheet['C' + str(irow)].value = 'NA'
else:
# Extract data from JSON
sheet['B' + str(irow)].value = data['results'][0]['LONGITUDE']
sheet['C' + str(irow)].value = data['results'][0]['LATITUDE']
wb.save('Street_Name_List.xlsx')
print('Job Done.')
sheet2 = wb.active
print(sheet2.max_row)
sheet2['B1'] = 'LONGITUDE'
sheet2['C1'] = 'LATITUDE'
for irow in range(2, 100):
mrt_station_english = sheet2['A' + str(irow)].value
query_string = 'https://developers.onemap.sg/commonapi/search?searchVal=' + str(
mrt_station_english) + '&returnGeom=Y&getAddrDetails=N&pageNum=1'
result = requests.get(query_string)
# Convert JSON into Python Object
data = json.loads(result.content)
# Extract data from JSON
sheet2['B' + str(irow)].value = data['results'][0]['LONGITUDE']
sheet2['C' + str(irow)] = data['results'][0]['LATITUDE']
wb.save('mrt_List_withlong.xlsx')
print('Job Done.')
#Trying to retrieve the data of one station (Clementi)
query_string_clementiMRTstation='https://developers.onemap.sg/commonapi/search?searchVal=' + "CLEMENTI " + '&returnGeom=Y&getAddrDetails=Y&pageNum=1'
result_clementi= requests.get(query_string_clementiMRTstation)
# Convert JSON into Python Object
data2 = json.loads(result_clementi.content)
print(data2)
#I have generated a list of street names from the street_name column of the HDBresales dataset
#By using the street names as inputs for the onemap.sg Geocode API, I could retrieve the information on the longitude(long) and latitude(lat) of the street.
#Street_Name_List(5).csv dataset has already all the retrieved information of the longitude and Latitude coordinates of the street as indicated below.
coordinate_street=pd.read_csv("Street_Name_List (5).csv")
coordinate_street.head()
#drop the rows with missing information on the longitude and latitude
coordinate_street.head()
coordinate_street=coordinate_street.dropna(axis = 0, how ='any')
#Now we have both the lat and long of our dataset of streetname, we shall make a list to generate the distance from the nearest mrt
list_of_lat = coordinate_street['LATITUDE']
list_of_long = coordinate_street['LONGITUDE']
# List if coordinates of all unique addresses in our dataset
list_of_coordinates = []
for lat, long in zip(list_of_lat, list_of_long):
list_of_coordinates.append((lat,long))
len(list_of_coordinates)
#I have made a list of all the MRT station names from the webpage:https://data.gov.sg/dataset/train-station-chinese-names
#By using the MRT station names as inputs for the onemap.sg Geocode API, I could retrieve the information on the longitude(long) and latitude(lat) of the MRT station.
#Street_Name_List(5).csv dataset has already all the retrieved information of the longitude and Latitude coordinates of the street as indicated below.
mrt_location=pd.read_csv("MRT_Log_lat_List.csv")
#drop the rows with missing information on the longitude and latitude
mrt_location=mrt_location.dropna(axis = 0, how ='any')
mrt_location.head()
# List of MRT Coordinates in Singapore
mrt_lat = mrt_location['LATITUDE']
mrt_long = mrt_location['LONGITUDE']
list_of_mrt_coordinates = []
for lat, long in zip(mrt_lat, mrt_long):
list_of_mrt_coordinates.append((lat, long))
len(list_of_mrt_coordinates)
# Calculting the distance to nearest MRT
from geopy.distance import geodesic
list_of_dist_mrt = []
min_dist_mrt = []
for origin in list_of_coordinates:
for destination in range(0, len(list_of_mrt_coordinates)):
list_of_dist_mrt.append(geodesic(origin,list_of_mrt_coordinates[destination]).meters)
shortest = (min(list_of_dist_mrt))
min_dist_mrt.append(shortest)
list_of_dist_mrt.clear()
coordinate_street['min_dist_mrt'] = min_dist_mrt
print(coordinate_street.head())
#Retrieving the dataset on HDB resales price pooled from the original data files which contains the resale-flat-prices dated from 1990 to 2020
resaleHDB_df=pd.read_csv("pooledHDB.csv")
print(resaleHDB_df.head())
#Merging the pooled HDB resale, longitude and latitude datasets of street address and min_dist_mrt together
combined_df=resaleHDB_df.merge(coordinate_street, how='inner', left_on='street_name', right_on='street_name')
#Generate a new column for resale price per unit area
combined_df['price_per_sqm'] = combined_df['resale_price'].div(combined_df['floor_area_sqm'])
#Generate a new column to calculate the distance from CBD using the coordinates of Raffles Place MRT Station: 103.8514631, 1.283933262
#Formulate to calculate distance from CBD
#Difference in Latitude=(latitude1-latitude2)*110.574
#Difference in Longitude=(Longitude1-longitude2)*111.32
#Distance between 2 locations=(Distance in Latitude^2+Distance in Longitude^)^0.5
dist_from_cbd = [ ((((row.LONGITUDE-103.8514631)*111.32)**2) + ((row.LATITUDE-1.283933262)*110.574)**2)**0.5 for index, row in combined_df.iterrows() ]
combined_df['dist_from_cbd']=dist_from_cbd
combined_df.head(10)
#drop columns with missing values
clean_df=combined_df.drop(['remaining_lease','average_storey'],axis=1)
clean_df
#Since the detail of the exact floor of the resale house is not provided in the dataset, utilise the average of the range
clean_df['floorlevel']=clean_df[['storey_to','storey_from']].mean(axis=1)
clean_df.to_csv('finalsecondcleanHDBdatadraft2.csv',index=False)
# Plotting Scatterplot to examine the relationship between HDB price and distance from mrt
fig, ax1 = plt.subplots(1,1, figsize=(20,12))
g = sns.scatterplot(x=clean_df['min_dist_mrt'],y=clean_df['price_per_sqm'], hue=clean_df['flat_type'], ax=ax1)
box = g.get_position()
g.set_position([box.x0, box.y0, box.width * 0.85, box.height]) # resize position
g.legend(loc='center right', bbox_to_anchor=(1.2, 0.5), ncol=1)
g.set_xlabel('Distance_from_MRT(m)',fontsize=20)
g.set_ylabel('Price_per_sqm(SGD$)',fontsize=20)
g.set(xlim=(0,2000))
plt.show()
# Plotting Scatterplot to examine the relationship between HDB price and floorlevel
fig, ax1 = plt.subplots(1,1, figsize=(20,12))
g = sns.scatterplot(x=clean_df['floorlevel'],y=clean_df['price_per_sqm'], hue=clean_df['flat_type'], ax=ax1)
box = g.get_position()
g.set_position([box.x0, box.y0, box.width * 0.85, box.height]) # resize position
g.legend(loc='center right', bbox_to_anchor=(1.2, 0.5), ncol=1)
g.set_xlabel('Floor_level',fontsize=20)
g.set_ylabel('Price_per_sqm(SGD$)',fontsize=20)
plt.show()
# Plotting Scatterplot to examine the relationship between HDB price and remaining_lease
fig, ax1 = plt.subplots(1,1, figsize=(20,12))
g = sns.scatterplot(x=clean_df['recalculate_remaining_lease'],y=clean_df['price_per_sqm'], hue=clean_df['flat_type'], ax=ax1)
box = g.get_position()
g.set_position([box.x0, box.y0, box.width * 0.85, box.height]) # resize position
g.legend(loc='center right', bbox_to_anchor=(1.2, 0.5), ncol=1)
g.set_xlabel('Years of Remaining Lease',fontsize=20)
g.set_ylabel('Price_per_sqm(SGD$)',fontsize=20)
plt.show()
# Plotting Scatterplot to examine the relationship between flatsize and resale_price
fig, ax1 = plt.subplots(1,1, figsize=(20,12))
g = sns.scatterplot(x=clean_df['floor_area_sqm'],y=clean_df['resale_price'], ax=ax1)
box = g.get_position()
g.set_position([box.x0, box.y0, box.width * 0.40, box.height*0.40]) # resize position
g.set_xlabel('Flat size(sqm)',fontsize=20)
g.set_ylabel('Resale price(SGD$)',fontsize=20)
plt.show()
# Plotting Scatterplot to examine the relationship between HDB price and distance from CBD
fig, ax1 = plt.subplots(1,1, figsize=(20,12))
g = sns.scatterplot(x=clean_df['dist_from_cbd'],y=clean_df['price_per_sqm'], hue=clean_df['flat_type'], ax=ax1)
box = g.get_position()
g.set_position([box.x0, box.y0, box.width * 0.85, box.height]) # resize position
g.legend(loc='center right', bbox_to_anchor=(1.2, 0.5), ncol=1)
g.set(xlim=(0,22))
g.set_xlabel('Distance from CBD(km)',fontsize=20)
g.set_ylabel('Resale price(SGD$)',fontsize=20)
plt.show()
# Import the libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
#We want to build the dataframe to be used for the modelling, features to be used for HDB resale price estimations: distance from mrt,flat size,floorlevel,distance from CBD
hedonic_model_data=clean_df[['resale_price','recalculate_remaining_lease','floor_area_sqm','min_dist_mrt','dist_from_cbd','floorlevel']]
hedonic_model_data
#resale_price before transformation
#histogram
sns.distplot(hedonic_model_data['resale_price'], bins=50, kde=False)
plt.show()
#perform data transformation of resale_price
from scipy.stats import norm
from scipy import stats
#histogram and normal probability plot
sns.distplot(hedonic_model_data['resale_price'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['resale_price'], plot=plt)
hedonic_model_data['resale_price']=np.log(hedonic_model_data['resale_price'])
#transformed histogram and normal probability plot
sns.distplot(hedonic_model_data['resale_price'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['resale_price'], plot=plt)
sns.distplot(hedonic_model_data['recalculate_remaining_lease'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['recalculate_remaining_lease'], plot=plt)
#data transformation (normalisaton of remaining_lease)
hedonic_model_data['recalculate_remaining_lease'] = np.log(hedonic_model_data['recalculate_remaining_lease'])
#transformed histogram and normal probability plot
sns.distplot(hedonic_model_data['recalculate_remaining_lease'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['recalculate_remaining_lease'], plot=plt)
#transformed histogram and normal probability plot
sns.distplot(hedonic_model_data['floor_area_sqm'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['floor_area_sqm'], plot=plt)
#data transformation (normalisaton of floor_area_sqm)
hedonic_model_data['floor_area_sqm']= np.log(hedonic_model_data['floor_area_sqm'])
#transformed histogram and normal probability plot
sns.distplot(hedonic_model_data['floor_area_sqm'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['floor_area_sqm'], plot=plt)
#data transformation (normalisaton of min_dist_mrt )
hedonic_model_data['min_dist_mrt']= np.log(hedonic_model_data['min_dist_mrt'])
#transformed histogram and normal probability plot
sns.distplot(hedonic_model_data['min_dist_mrt'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['min_dist_mrt'], plot=plt)
#data transformation ()
hedonic_model_data['dist_from_cbd']= np.log(hedonic_model_data['dist_from_cbd'])
#transformed histogram and normal probability plot
sns.distplot(hedonic_model_data['dist_from_cbd'], fit=norm, bins=50, kde=False);
fig = plt.figure()
res = stats.probplot(hedonic_model_data['dist_from_cbd'], plot=plt)
#Our target variable is resale price
#The rest are our non-dependent variable
y = hedonic_model_data.iloc[:,0].values
X = hedonic_model_data.iloc[:,1:6].values
print(y)
y=y.reshape((-1, 1))
print(y)
print(X)
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Fitting Multiple Linear Regression to the Training set
regression = LinearRegression()
regression.fit(X_train, y_train)
X_test
# Predicting the Test set results
y_pred = regression.predict(X_test)
df = pd.DataFrame(data=y_test, columns=['y_test'])
df['y_pred'] = y_pred
df
pred_resaleprice=np.exp(y_pred)
original_resaleprice=np.exp(y_test)
df2 = pd.DataFrame(data=original_resaleprice, columns=['original_resaleprice'])
df2['pred_resaleprice(LinearRegression)']=pred_resaleprice
df2.head(10)
# Since all values are numeric, do a correction and sort to determine the most important features relative to HDB resale prices
corr = hedonic_model_data.corr()
corr.sort_values(["resale_price"], ascending = False, inplace = True)
print(corr["resale_price"])
from sklearn.metrics import r2_score
yhat_train = regression.predict(X_train)
yhat_test = regression.predict(X_test)
print("R2 Score for train_dataset(LinearRegression)={}".format(r2_score(y_train,yhat_train))) # R2 score, 1.0 is best
print("R2 Score for test_dataset(LinearRegression)={}".format(r2_score(y_test,yhat_test)))
accuracy = regression.score(X_train, y_train)
"Accuracy for linear regression algorithm for train dataset is: {}%".format(int(round(accuracy * 100)))
accuracy = regression.score(X_test, y_test)
"Accuracy for linear regression algorithm for test dataset is: {}%".format(int(round(accuracy * 100)))
# Plot predictions between train and test data set
plt.scatter(yhat_train ,y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(yhat_test,y_test, c = "red", marker = "s", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([11.5, 15.5], [11.5, 15.5], c = "red")
plt.show()
from sklearn.metrics import mean_squared_error
print("RMSE for train_set(LinearRegression) is: {}".format(mean_squared_error(y_train, yhat_train)))
from sklearn.metrics import mean_squared_error
print("RMSE for test_set(LinearRegression) is: {}".format(mean_squared_error(y_test, yhat_test)))
from sklearn import ensemble
GB_default = ensemble.GradientBoostingRegressor().fit(X_train, y_train)
from sklearn.metrics import r2_score
y_GB_train = GB_default.predict(X_train)
y_GB_test = GB_default.predict(X_test)
print("R2 Score for train_dataset(GBregression)={}".format(r2_score(y_train,y_GB_train))) # R2 score, 1.0 is best
print("R2 Score for test_dataset(GBregression)={}".format(r2_score(y_test,y_GB_test)))
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(GB_default, X_test, y_test, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(GB_default, X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
accuracy = GB_default.score(X_train, y_train)
"Accuracy for train_set(GBregression) is: {}%".format(int(round(accuracy * 100)))
accuracy = GB_default.score(X_test, y_test)
"Accuracy for test_set(GBregression) is: {}%".format(int(round(accuracy * 100)))
from sklearn.metrics import mean_squared_error
print("RMSE for train_set(GBregression) is: {}".format(mean_squared_error(y_train, y_GB_train)))
from sklearn.metrics import mean_squared_error
print("RMSE for test_set(GBregression) is: {}".format(mean_squared_error(y_test, y_GB_test)))
# Plot predictions between train and test data set
plt.scatter(y_GB_train , y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_GB_test, y_test, c = "red", marker = "s", label = "Validation data")
plt.title("Gradient Boosting Regressor")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([11.5, 15.5], [11.5, 15.5], c = "red")
plt.show()
pred_resaleprice_GB=np.exp(y_GB_test)
original_resaleprice=np.exp(y_test)
GB_df = pd.DataFrame(data=original_resaleprice, columns=['original_resaleprice'])
GB_df['pred_resaleprice(GB)']=pred_resaleprice_GB
GB_df.head(10)
#Examining the feature importance being analysed by the GB model
import pandas as pd
pd.concat((pd.DataFrame(hedonic_model_data.iloc[:, 1:].columns, columns = ['variable']),
pd.DataFrame(GB_default.feature_importances_, columns = ['importance'])),
axis = 1).sort_values(by='importance', ascending = False)[:20]
Gradient Boosting Regression Model is the best model for this data set because it has: Highest accuracy scores Highest R2 scores Lowest RMSE scores
Floor_area_sqm and number of years of remaining lease are important features in the prediction of the resale HDB flat price.