House Price Prediction
Importing Necessary Libraries
Data Exploration
In [1]: # importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
!pip install lazypredict
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
Out[3]: price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating aircondit
Out[4]: price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating aircond
price 0
Out[5]:
area 0
bedrooms 0
bathrooms 0
stories 0
mainroad 0
guestroom 0
basement 0
hotwaterheating 0
airconditioning 0
parking 0
prefarea 0
furnishingstatus 0
dtype: int64
False 545
Out[6]:
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 545 non-null int64
1 area 545 non-null int64
2 bedrooms 545 non-null int64
3 bathrooms 545 non-null int64
4 stories 545 non-null int64
5 mainroad 545 non-null object
6 guestroom 545 non-null object
7 basement 545 non-null object
8 hotwaterheating 545 non-null object
9 airconditioning 545 non-null object
10 parking 545 non-null int64
11 prefarea 545 non-null object
12 furnishingstatus 545 non-null object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
price 219
Out[9]:
area 284
bedrooms 6
bathrooms 4
stories 4
mainroad 2
guestroom 2
basement 2
hotwaterheating 2
airconditioning 2
parking 4
prefarea 2
furnishingstatus 3
dtype: int64
Data Visualization
In [11]: # Visualizing 'price'
# Histogram of the 'price' column, drawn in red with matplotlib's default binning.
plt.hist(x=df['price'], color='r')
# Title and axis labels so the figure can be read on its own.
plt.title('Distribution of Prices')
plt.ylabel('Frequency')
plt.xlabel('Price')
plt.show()
Out[26]: price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating aircon
0 13300000 7420 4 2 3 1 0 0 0
1 12250000 8960 4 4 4 1 0 0 0
2 12250000 9960 3 2 2 1 0 1 0
3 12215000 7500 4 2 2 1 0 1 0
4 11410000 7420 4 1 2 1 1 1 0
... ... ... ... ... ... ... ... ... ...
540 1820000 3000 2 1 1 1 0 1 0
In [29]: print(models)  # `models` is defined in a cell not shown here; the printed table below has one row per regressor
Time Taken
Model
GradientBoostingRegressor 0.19
PoissonRegressor 0.02
LassoLarsCV 0.03
LassoLarsIC 0.02
LarsCV 0.05
Lars 0.09
TransformedTargetRegressor 0.01
LinearRegression 0.01
Lasso 0.01
LassoLars 0.01
Ridge 0.01
SGDRegressor 0.01
LassoCV 0.08
RidgeCV 0.01
HistGradientBoostingRegressor 0.25
BaggingRegressor 0.05
XGBRegressor 0.13
LGBMRegressor 0.32
ExtraTreesRegressor 0.25
RandomForestRegressor 0.32
ElasticNet 0.02
HuberRegressor 0.02
KNeighborsRegressor 0.01
OrthogonalMatchingPursuitCV 0.02
AdaBoostRegressor 0.13
TweedieRegressor 0.02
GammaRegressor 0.02
RANSACRegressor 0.21
DecisionTreeRegressor 0.01
ExtraTreeRegressor 0.01
OrthogonalMatchingPursuit 0.03
ElasticNetCV 0.08
BayesianRidge 0.02
DummyRegressor 0.01
NuSVR 0.08
QuantileRegressor 9.24
SVR 0.02
KernelRidge 0.12
PassiveAggressiveRegressor 0.05
LinearSVR 0.01
MLPRegressor 1.87
GaussianProcessRegressor 0.13
In [30]: predictions  # bare expression so the notebook displays the value; `predictions` is defined in a cell not shown here
Model
Model Evaluation
In [32]: # Evaluate the model using metrics
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
# np.sqrt of the plain MSE gives the same value on every scikit-learn version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
train_mae = mean_absolute_error(y_train, train_predictions)
# Same two metrics on the held-out split, for comparison against the training scores.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)
In [33]: # Visualize the predicted values vs. actual values for the training set
# Scatter of actual training targets (x-axis) against the training predictions (y-axis).
plt.scatter(y_train, train_predictions, color='violet', alpha=0.5)
# Reference line from (min, min) to (max, max) of the actual values; a point on it is an exact prediction.
# NOTE(review): the next line is cut off in this export (it ends at `lines`); restore the rest of the call from the original notebook.
plt.plot([min(y_train), max(y_train)], [min(y_train), max(y_train)], color='cyan', lines
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Training Set - Actual vs. Predicted Price')
plt.show()
# Visualize the predicted values vs. actual values for the testing set
# Same layout as the training figure, using the test targets and test predictions.
plt.scatter(y_test, test_predictions, color='violet', alpha=0.5)
# NOTE(review): the next line is also cut off in this export (it ends at `linestyle`); restore it from the original notebook.
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='cyan', linestyle
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Testing Set - Actual vs. Predicted Price')
plt.show()
Model Interpretation
In [34]: importances = model.feature_importances_
# Column labels of the training frame, in the same order as the importance scores.
feature_names = X_train.columns

# Rank features from most to least important: argsort yields ascending positions,
# and flipping that order gives descending importance.
sorted_indices = np.flip(np.argsort(importances))
# Apply the same ordering to the scores and to their labels so the two stay aligned.
sorted_importances = np.take(importances, sorted_indices)
sorted_features = feature_names[sorted_indices]