Shailesh020902@gmail - Com 1
Shailesh020902@gmail - Com 1
30 64 1 1.1
0 30 62 3 1
1 30 65 0 1
2 31 59 2 1
3 31 65 4 1
4 33 58 10 1
5 33 60 0 1
6 34 59 0 2
7 34 66 9 2
8 34 58 30 1
9 34 60 1 1
10 34 61 10 1
11 34 67 7 1
12 34 60 0 1
13 35 64 13 1
14 35 63 0 1
15 36 60 1 1
16 36 69 0 1
17 37 60 0 1
18 37 63 0 1
19 37 58 0 1
20 37 59 6 1
21 37 60 15 1
22 37 63 0 1
23 38 69 21 2
24 38 59 2 1
25 38 60 0 1
26 38 60 0 1
27 38 62 3 1
28 38 64 1 1
29 38 66 0 1
.. .. .. .. ...
275 67 66 0 1
276 67 61 0 1
277 67 65 0 1
278 68 67 0 1
279 68 68 0 1
280 69 67 8 2
281 69 60 0 1
282 69 65 0 1
283 69 66 0 1
284 70 58 0 2
285 70 58 4 2
286 70 66 14 1
287 70 67 0 1
288 70 68 0 1
289 70 59 8 1
290 70 63 0 1
291 71 68 2 1
292 72 63 0 2
293 72 58 0 1
294 72 64 0 1
295 72 67 3 1
296 73 62 0 1
297 73 68 0 1
298 74 65 3 2
299 74 63 0 1
300 75 62 1 1
301 76 67 0 1
302 77 65 3 1
303 78 65 1 2
304 83 58 2 2
# In [2]
# The raw column headers are not descriptive, so rename them after the
# names used in the Kaggle dataset description.
data.columns = ['Age', 'Op_year', 'axil_nodes', 'Surv_status']

# Per-column count of missing values; the recorded output shows zero for
# every column, i.e. no nulls in this dataset.
print(pd.isnull(data).sum())

# Number of rows per survival class; the recorded output (224 vs 81) shows
# that the dataset is imbalanced.
print("\n classes:", data.Surv_status.groupby(data.Surv_status).count())
Age 0
Op_year 0
axil_nodes 0
Surv_status 0
dtype: int64
classes: Surv_status
1 224
2 81
Name: Surv_status, dtype: int64
# In [3]
# Box plot of Age for each survival status.
# Author's note: survival status within 5 years of the operation is
# represented by 1 in the dataset; the plot suggests most people aged
# 30-40 survived.
sns.boxplot(x="Surv_status", y="Age", data=data)
plt.show()
# In [4]
# Scatter of Age against axil_nodes, one facet per survival status.
# Author's note: points concentrate at axil_nodes == 0 for both survival
# statuses, comparatively more so for status 1; the same comparison can be
# made with box plots.
grid = sns.FacetGrid(data, col='Surv_status', hue='Surv_status')
grid.map(plt.scatter, "Age", "axil_nodes")
grid.add_legend()
plt.show()
# In [5]
# Scatter of Op_year against axil_nodes, one facet per survival status.
# Author's note: nothing substantial can be deduced from this distribution,
# only that most of the data lies between 0 and 10 axil nodes across all
# operation years for both survival statuses.
grid = sns.FacetGrid(data, col='Surv_status', col_wrap=2)
grid.map(plt.scatter, "Op_year", "axil_nodes")
grid.add_legend()
plt.show()
# In [92]
# Z-score and normal-distribution checks on the Age column.
np.random.seed(1234)

# Author's note: looking a z-score up in the probability table and calling
# .cdf give the same answer.
m = data.Age.mean()
med = data.Age.median()
d = data.Age.std()
print(m, d, med)

# Create the z-score column before anything reads it: the lookups below
# access data.zscore, which does not exist until this assignment has run.
data['zscore'] = zscore(data.Age)

# Normal model parameterised with the sample mean and standard deviation
# computed above (previously hand-copied as 52.53 and 10.74).
age_dist = norm(m, d)
print("cdf: ", age_dist.cdf(67))
print("pdf: ", age_dist.pdf(67))

# Hand-computed z-score versus the scipy value for row label 272.
# NOTE(review): pandas .std() and scipy zscore use different default ddof,
# so the two numbers may differ slightly - confirm which one is intended.
print("calclated z score:", (data.Age.loc[272] - m) / d)
print("stats z score:", data.zscore.loc[272])

print(data.Age.loc[302], data.Age.loc[222])
print("Zscore for age 77 and 60 is:", data.zscore.loc[302], data.zscore.loc[222])

# 0.9890 and 0.7549 are hard-coded; presumably the z-table probabilities
# for the two z-scores printed above - TODO confirm.
print("value for z score diffrence :", 0.9890 - 0.7549)
print("infernece that probability of observing an age between 60 and 77 is 23.40% ")

# Bucket every row by its z-score rounded to the nearest integer and count
# the rows in each bucket.
y = data.zscore.round()
data['y'] = y
print(data.groupby(['y']).count())
# In [61]
# Univariate analysis of Age: histogram with a KDE and a fitted normal
# curve. Author's note: the Age distribution is nearly normal, with mean
# and median around 52.
# Plots the sampled frame; use `x = data.Age` instead for the full dataset.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -
# confirm the installed version still provides it.
x = sample_data.Age
sns.distplot(x, kde=True, fit=norm)
plt.show()
x.mean()
Out[61]: 52.9875
# Most frequent value(s) of the Series currently bound to x; the recorded
# output below shows a single mode of 58.
x.mode()
Out[8]: 0 58
dtype: int64
# In [9]
# Distribution of axil_nodes.
# Author's note: the axil nodes are highly right-skewed with a mean of
# about 4, which indicates most axil_nodes values in the dataset lie
# between 0 and 4; the gap between mean and median arises from the skew.
x = data.axil_nodes
sns.distplot(x)
plt.show()
print("mean:", x.mean())
print("median:", x.median())
mean: 4.036065573770492
median: 1.0
# In [10]
# Cumulative KDE (CDF estimate) of Age with a vertical marker at Age 70.
# Author's note: it is almost 90% probable to find a value equal to or
# less than 70.
sns.kdeplot(data.Age, cumulative=True)
plt.axvline(x=70)
plt.show()
# In [11]
# Cumulative KDE (CDF estimate) of Op_year with a vertical marker at 68.
# Author's note: it is almost 90% probable to find a value equal to or
# less than 68.
sns.kdeplot(data.Op_year, cumulative=True)
plt.axvline(x=68)
plt.show()
# In [12]
# Pairwise Pearson correlation between all columns of `data`.
# Author's note: Surv_status shows very little correlation with any of the
# other columns.
pearson_matrix = data.corr(method='pearson')
print("Pearson corelation matrix:\n", pearson_matrix)