Boston House Prediction - Colab1
Boston House Prediction - Colab1
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'boston-house-price-prediction:
# Directory that downloaded data sources are extracted under; also the
# target of the first symlink created below.
KAGGLE_INPUT_PATH = '/kaggle/input'
# Target of the second symlink created below.
KAGGLE_WORKING_PATH = '/kaggle/working'
# NOTE(review): not referenced in the visible part of this notebook --
# presumably used by a line lost in the export; confirm before removing.
KAGGLE_SYMLINK = 'kaggle'
try:
os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'i except FileExistsError:
pass try:
os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", except FileExistsError:
pass
Avinash Shukla (27)
for data_source_mapping in DATA_SOURCE_MAPPING.split( directory, download_url_encoded =
data_source_map download_url = unquote(download_url_encoded)
filename = urlparse(download_url).path
destination_path = os.path.join(KAGGLE_INPUT_PATH try:
with urlopen(download_url) as fileres, NamedT total_length = fileres.headers['content-l
print(f'Downloading {directory}, {total_l dl = 0
data = fileres.read(CHUNK_SIZE) while len(data) > 0:
dl += len(data) tfile.write(data)
done = int(50 * dl / int(total_length sys.stdout.write(f"\r[{'=' * done}{'
sys.stdout.flush()
data = fileres.read(CHUNK_SIZE) if filename.endswith('.zip'):
with ZipFile(tfile) as zfile:
zfile.extractall(destination_path) else:
with tarfile.open(tfile.name) as tarfil tarfile.extractall(destination_path)
print(f'\nDownloaded and uncompressed: {d except HTTPError as e:
print(f'Failed to load (likely expired) {down continue
except OSError as e:
print(f'Failed to load {download_url} to path continue
# Display the first few rows of the dataframe.
# The header print must be its own statement; when fused onto the comment
# line it is treated as comment text and never executes.
print("First 5 rows of the dataset:")
print(data.head())
First 5 rows of the dataset:
b lstat medv
0 396.90 4.98 24.0
1 396.90 9.14 21.6
2 392.83 4.03 34.7
3 394.63 2.94 33.4
4 396.90 5.33 36.2
# Check for any missing values in the dataset.
# The header print is a separate statement so it is not swallowed by the
# comment; the second print reports the per-column count of null entries.
print("\nMissing values in the dataset:")
print(data.isnull().sum())
crim 0
zn 0
indus 0
chas 0
nox 0
rm 5
age 0
dis 0
rad 0
tax 0
ptratio 0
b 0
lstat 0
medv 0
dtype: int64
Summary Statistics:
             crim          zn       indus        chas         nox          rm  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  501.000000
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284341
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.705587
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.884000
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208000
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.625000
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000
Avinash Shukla (27)
age dis rad tax ptratio b \
            lstat        medv
count  506.000000  506.000000
mean 12.653063 22.532806
std 7.141062 9.197104
min 1.730000 5.000000
25% 6.950000 17.025000
50% 11.360000 21.200000
75% 16.955000 25.000000
max 37.970000 50.000000
C C
# Second panel of the 2x2 subplot grid: histogram of the 'rm' column with a
# KDE curve overlaid (kde=True).
plt.subplot(2, 2, 2)
sns.histplot(data['rm'], kde=True)
plt.title('Average Number of Rooms Distribution')
Avinash Shukla (27)
# Third panel of the 2x2 subplot grid: histogram of the 'lstat' column with
# a KDE curve overlaid.
plt.subplot(2, 2, 3)
sns.histplot(data['lstat'], kde=True)
plt.title('Lower Status Population (%) Distribution')

# Fourth panel: histogram of the 'medv' column with a KDE curve overlaid.
plt.subplot(2, 2, 4)
sns.histplot(data['medv'], kde=True)
plt.title('Median Home Value Distribution')

# Adjust subplot spacing first, then render the figure. These are two
# separate calls and must be on separate lines to be valid Python.
plt.tight_layout()
plt.show()
Question No.02
Avinash Shukla (27)
# Calculate the correlation matrix.
# The assignment must sit on its own line; when fused onto the comment line
# `correlation_matrix` is never defined for the statements that follow.
correlation_matrix = data.corr()
C C
# Identify the features with the highest positive and # Assume 'medv' is the target variable (median
home v target_variable = 'medv'
correlation_with_target = correlation_matrix[target_v
# Display the features with highest positive and nega print("\nFeatures with highest positive correlation
w print(correlation_with_target[correlation_with_target
# Fit the scaler on the training data and transform it in the same call,
# then reuse the already-fitted scaler on the test data (transform only,
# no refit) so both splits are scaled with the training-set parameters.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Question No.04
C C
Question No.05
# Predict the house prices using the (scaled) testing data.
# The assignment is on its own line so it is not swallowed by the comment.
y_pred = model.predict(X_test_scaled)
# Report the evaluation metrics, one per line. `mae`, `mse` and `rmse` are
# expected to have been computed earlier in this cell (not visible here).
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
Mean Absolute Error (MAE): 3.2064039639003856
Mean Squared Error (MSE): 24.40482518814648
Root Mean Squared Error (RMSE): 4.940124005341008