Dsbda 5
Dsbda 5
[3]: df=pd.read_csv("/home/bcl07/heart_disease.csv")
[4]: df  # Display the loaded DataFrame (the notebook renders a truncated preview)
[4]: age sex cp trestbps chol fbs restecg thalach exang oldpeak \
0 52 1 0 125 212 0 1 168 0 1.0
1 53 1 0 140 203 1 0 155 1 3.1
2 70 1 0 145 174 0 1 125 1 2.6
3 61 1 0 148 203 0 1 161 0 0.0
4 62 0 0 138 294 1 1 106 0 1.9
… … … .. … … … … … … …
1020 59 1 1 140 221 0 1 164 1 0.0
1021 60 1 0 125 258 0 0 141 1 2.8
1022 47 1 0 110 275 0 0 118 1 1.0
1023 50 0 0 110 254 0 0 159 0 0.0
1024 54 1 0 120 188 0 1 113 0 1.4
[5]: df.columns  # List the column labels of the DataFrame
1
[5]: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
dtype='object')
[6]: df.isnull().sum()  # Count missing values in each column
[6]: age 0
sex 0
cp 0
trestbps 0
chol 0
fbs 0
restecg 0
thalach 0
exang 0
oldpeak 0
slope 0
ca 0
thal 0
target 0
dtype: int64
[8]: df=df.drop_duplicates()
[9]: df.describe()  # Summary statistics (count, mean, std, min, quartiles, max) per column
thal target
2
count 302.000000 302.000000
mean 2.314570 0.543046
std 0.613026 0.498970
min 0.000000 0.000000
25% 2.000000 0.000000
50% 2.000000 1.000000
75% 3.000000 1.000000
max 3.000000 1.000000
[10]: df.info()  # Print index range, per-column non-null counts, dtypes and memory usage
<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 878
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 302 non-null int64
1 sex 302 non-null int64
2 cp 302 non-null int64
3 trestbps 302 non-null int64
4 chol 302 non-null int64
5 fbs 302 non-null int64
6 restecg 302 non-null int64
7 thalach 302 non-null int64
8 exang 302 non-null int64
9 oldpeak 302 non-null float64
10 slope 302 non-null int64
11 ca 302 non-null int64
12 thal 302 non-null int64
13 target 302 non-null int64
dtypes: float64(1), int64(13)
memory usage: 35.4 KB
[11]: df.isna().sum()  # Re-check missing values per column after dropping duplicates
[11]: age 0
sex 0
cp 0
trestbps 0
chol 0
fbs 0
restecg 0
thalach 0
exang 0
oldpeak 0
slope 0
ca 0
3
thal 0
target 0
dtype: int64
[12]: df.head()  # Preview the first five rows
[12]: age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \
0 52 1 0 125 212 0 1 168 0 1.0 2
1 53 1 0 140 203 1 0 155 1 3.1 0
2 70 1 0 145 174 0 1 125 1 2.6 0
3 61 1 0 148 203 0 1 161 0 0.0 2
4 62 0 0 138 294 1 1 106 0 1.9 1
ca thal target
0 2 3 0
1 0 3 0
2 0 3 0
3 1 3 0
4 3 2 0
[13]: df.fbs.unique()  # Distinct values present in the fbs column
[20]: df.columns  # Column labels at this point in the notebook
4
IQR = Q3 - Q1
threshold = 1.5 * IQR
outlier_mask = (column < Q1 - threshold) | (column > Q3 + threshold)
return column[~outlier_mask]
/tmp/ipykernel_10564/1228815343.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
5
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df[col] = remove_outliers(df[col])
6
7
8
9
[30]: df = df.dropna()
[31]: df.isna().sum()  # Confirm that no missing values remain after dropna()
[31]: age 0
sex 0
cp 0
trestbps 0
chol 0
fbs 0
restecg 0
thalach 0
exang 0
oldpeak 0
slope 0
ca 0
thal 0
target 0
dtype: int64
[32]: df = df.drop('fbs',axis=1)
# Report the correlations: a heading line, the values, then a blank line.
print("Correlation with the Target:", correlations, "", sep="\n")
10
slope 0.327420
ca -0.459629
thal -0.389514
Name: target, dtype: float64
[37]: x = df[['cp','thal','exang','oldpeak','slope','ca']]
y = df.target
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
x_train.shape,x_test.shape,y_train.shape,y_test.shape
11
[40]: # Fit the scaler on the training features only and transform them in one step.
# NOTE(review): `scaler` is created in a cell not shown here — presumably a
# scikit-learn scaler; confirm its type.
x_train_scaled = scaler.fit_transform(x_train)
# Apply the already-fitted scaler to the test features (no re-fitting), so the
# test split is scaled with the parameters learned from the training split.
x_test_scaled = scaler.transform(x_test)
[44]: # Inspect the shape of the training labels.
# NOTE(review): the recorded result (219, 1) is a 2-D column vector, which is what
# triggers the DataConversionWarning shown below; passing 1-D labels (for example
# via ravel(), as the warning suggests) would avoid it.
y_train.shape
[44]: (219, 1)
Accuracy: 0.8181818181818182
/home/bcl07/.local/lib/python3.8/site-packages/sklearn/utils/validation.py:1183:
DataConversionWarning: A column-vector y was passed when a 1d array was
expected. Please change the shape of y to (n_samples, ), for example using
ravel().
y = column_or_1d(y, warn=True)
[ ]:
12