Cyber Security Breaches Coding
Cyber Security Breaches Coding
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
Dataset 1
In [2]:
df = pd.read_csv("/kaggle/input/data-breaches-a-comprehensive-list/df_1.csv")
In [3]:
df.head()
Out[3]:
poor
2 2 Accendo Insurance Co. 2020 175350 healthcare [8][9]
security
15200000
3 3 Adobe Systems Incorporated 2013 tech hacked [10]
0
poor
4 4 Adobe Inc. 2019 7500000 tech [11][12]
security
In [4]:
df.shape
Out[4]:
(352, 7)
In [5]:
df.dtypes
Out[5]:
Unnamed: 0 int64
Entity object
Year object
Records object
Organization type object
Method object
Sources object
dtype: object
In [6]:
df.drop(['Sources'], axis=1, inplace=True)
In [7]:
df.isnull().any()
Out[7]:
Unnamed: 0 False
Entity False
Year False
Records True
Organization type False
Method True
dtype: bool
In [8]:
df.isnull().sum()
Out[8]:
Unnamed: 0 0
Entity 0
Year 0
Records 2
Organization type 0
Method 1
dtype: int64
In [9]:
df.columns = ['id', 'Entity', 'Year', 'Records', 'Organization type',
'Method']
In [10]:
df.head(10)
Out[10]:
15200000
3 3 Adobe Systems Incorporated 2013 tech hacked
0
7 7 Affinity Health Plan, Inc. 2013 344579 healthcare lost / stolen media
32000000
8 8 Airtel 2019 telecommunications poor security
0
In [11]:
table_year_df = df['Year'].value_counts()
table_year_df
Out[11]:
2011 34
2020 31
2019 30
2015 28
2013 28
2018 26
2014 25
2012 23
2016 22
2010 19
2008 16
2021 13
2009 13
2007 12
2017 9
2006 7
2005 6
2022 5
2004 2
2019-2020 1
2018-2019 1
2014 and 2015 1
Name: Year, dtype: int64
In [12]:
df['Year'] = df['Year'].astype(str)
df['Year'] = df['Year'].str[:4]
df['Year'] = df['Year'].astype(int)
In [13]:
df.dtypes
Out[13]:
id int64
Entity object
Year int64
Records object
Organization type object
Method object
dtype: object
In [14]:
table_year_df = df['Year'].value_counts()
table_year_df
Out[14]:
2011 34
2019 31
2020 31
2015 28
2013 28
2018 27
2014 26
2012 23
2016 22
2010 19
2008 16
2021 13
2009 13
2007 12
2017 9
2006 7
2005 6
2022 5
2004 2
Name: Year, dtype: int64
In [15]:
sns.countplot(x='Year', data=df);
plt.title('Data Breaches pro Jahr')
plt.xticks(rotation=90);
In [16]:
sns.countplot(x='Year', data=df, order=table_year_df.index.values);
plt.title('Data Breaches pro Jahr in order')
plt.xticks(rotation=90);
In [17]:
table1 = df['Method'].value_counts()
table1
Out[17]:
hacked 192
poor security 43
lost / stolen media 33
accidentally published 21
inside job 19
lost / stolen computer 16
unknown 7
improper setting, hacked 2
poor security/inside job 2
intentionally lost 1
accidentally exposed 1
publicly accessible Amazon Web Services (AWS) server 1
hacked/misconfiguration 1
rogue contractor 1
ransomware hacked 1
misconfiguration/poor security 1
unprotected api 1
zero-day vulnerabilities 1
data exposed by misconfiguration 1
Poor security 1
poor security / hacked 1
accidentally uploaded 1
unsecured S3 bucket 1
inside job, hacked 1
social engineering 1
Name: Method, dtype: int64
In [18]:
sns.countplot(x='Method', data=df, order = table1.index.values);
plt.title('Method')
plt.xticks(rotation=90);
In [19]:
df_nothacked = df.loc[df['Method'] != 'hacked']
df_nothacked.head()
Out[19]:
In [20]:
sns.countplot(x='Method', data=df_nothacked, order =
df_nothacked['Method'].value_counts().index);
plt.title('Method')
plt.xticks(rotation=90);
In [21]:
table2 = df['Organization type'].value_counts()
table2.head(23)
Out[21]:
web 53
healthcare 47
financial 38
government 30
retail 27
tech 19
academic 13
telecoms 12
gaming 12
social network 8
hotel 8
transport 7
military 7
energy 4
restaurant 3
media 3
mobile carrier 2
social media 2
government, military 2
telecom 2
tech, retail 2
government, healthcare 2
telecommunications 2
Name: Organization type, dtype: int64
In [22]:
org_counts = df['Organization type'].value_counts().rename('org_counts')
df_org = df.merge(org_counts.to_frame(),
left_on='Organization type',
right_index=True)
In [23]:
org_counts.head()
Out[23]:
web 53
healthcare 47
financial 38
government 30
retail 27
Name: org_counts, dtype: int64
In [24]:
df_org.head()
Out[24]:
Organization
id Entity Year Records Method org_counts
type
lost / stolen
5 5 Advocate Medical Group 2017 4000000 healthcare 47
media
lost / stolen
7 7 Affinity Health Plan, Inc. 2013 344579 healthcare 47
media
Organization
id Entity Year Records Method org_counts
type
In [25]:
df_org_upper = df_org[df_org.org_counts > 2]
In [26]:
sns.countplot(x='Organization type', data=df_org_upper, order =
df_org_upper['Organization type'].value_counts().index);
plt.title('Data Breaches by Organisations')
plt.xticks(rotation=90);
In [27]:
sns.histplot(x='Organization type', stat='percent', data=df_org_upper);
plt.title('Data Breaches by Organisations')
plt.xticks(rotation=90);
In [28]:
df_cleaned_records = df.drop(df[df.Records == 'unknown'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'G20 world leaders'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'tens of thousands'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '19 years of data'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '63 stores'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'over 5,000,000'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'unknown (client list)'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'millions'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '235 GB'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == '350 clients emails'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '9,000,000 (approx) - basic
booking, 2208 (credit card details)'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'Unknown'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == '2.5GB'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == '250 locations'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '500 locations'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '54 locations'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '51 locations'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '10 locations'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '8 locations'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '93 stores'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == '200 stores'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'undisclosed'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'Source Code
Compromised'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == '100 terabytes'].index,
inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'TBC'].index, inplace=True)
df_cleaned_records = df.drop(df[df.Records == 'unknown'].index, inplace=True)
df_cleaned_records = df.dropna(subset=['Records'])
In [29]:
df_cleaned_records.shape
Out[29]:
(305, 6)
In [30]:
df_cleaned_records['Records'] = df_cleaned_records['Records'].astype(float)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
0 2016 5.405824e+08
1 2020 1.251422e+09
Year Total Records
2 2013 3.469435e+09
3 2019 3.824901e+09
4 2017 2.547669e+08
5 2018 1.531850e+09
6 2005 4.682500e+07
7 2021 6.139627e+07
8 2015 2.016545e+08
9 2004 9.251000e+07
1
2006 7.126000e+07
0
1
2014 8.513410e+08
1
Year Total Records
1
2008 6.906650e+07
2
1
2010 1.598048e+07
3
1
2009 2.554680e+08
4
1
2011 2.277881e+08
5
1
2012 4.288396e+08
6
1
2007 1.532864e+08
7
1
2022 9.958922e+06
8
In [33]:
plt.figure(figsize=(10,5))
sns.lineplot(data=df_total_records, x='Year', y='Total Records')
plt.xticks([2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2022]);
In [34]:
df_total_records_org = df_cleaned_records.groupby('Organization type',
sort=False)["Records"].sum().reset_index(name ='Total Records')
In [35]:
df_total_records_org = df_total_records_org.sort_values('Total Records',
ascending=False, ignore_index=True)
df_total_records_org_clean =
df_total_records_org.drop(df_total_records_org.index[21:])
In [36]:
df_total_records_org_clean
Out[36]:
0 web 5.203696e+09
2 tech 1.000898e+09
4 financial 8.185971e+08
5 hotel 5.055630e+08
6 telecommunications 4.200000e+08
7 retail 3.721407e+08
10 government 2.023509e+08
Organization type Total Records
12 gaming 1.726240e+08
13 healthcare 1.711426e+08
20 genealogy 9.228389e+07
In [37]:
sns.catplot(data=df_total_records_org_clean, x='Organization type', y='Total
Records')
plt.title('Total Records per Organisation Type')
plt.xticks(rotation=90);
In [38]:
df_total_records_org_clean =
df_total_records_org_clean.drop(df_total_records_org_clean.index[:1])
In [39]:
sns.catplot(data=df_total_records_org_clean, x='Organization type', y='Total
Records')
plt.title('Total Records per Organisation Type')
plt.xticks(rotation=90);
In [40]:
df_total_records_Method = df_cleaned_records.groupby('Method', sort=False)
["Records"].sum().reset_index(name ='Total Records')
In [41]:
df_total_records_Method = df_total_records_Method.sort_values('Total
Records', ascending=False, ignore_index=True)
df_total_records_Method_clean =
df_total_records_Method.drop(df_total_records_Method.index[21:])
df_total_records_Method_clean
Out[41]:
0 hacked 7.404780e+09
2 unknown 4.482339e+08
In [42]:
sns.catplot(data=df_total_records_Method_clean, x='Method', y='Total
Records')
plt.title('Total Records per Method')
plt.xticks(rotation=90);
In [43]:
Method_counts = df['Method'].value_counts().rename('Method_counts')
df_Method = df_total_records_Method_clean.merge(Method_counts.to_frame(),
left_on='Method',
right_index=True)
In [44]:
df_Method['relative'] = df_Method['Total Records']/df_Method['Method_counts']
In [45]:
df_Method = df_Method.sort_values('relative', ascending=False,
ignore_index=True)
In [46]:
df_Method
Out[46]:
1
publicly accessible Amazon Web Services (AWS) ... 3.800000e+07 1 3.800000e+07
0
1
accidentally published 2.699175e+08 19 1.420618e+07
1
1
improper setting, hacked 2.145775e+07 2 1.072888e+07
2
1
social engineering 6.054459e+06 1 6.054459e+06
3
Method Total Records Method_counts relative
1
lost / stolen media 1.704345e+08 32 5.326079e+06
4
1
inside job 7.642610e+07 18 4.245895e+06
5
1
lost / stolen computer 4.139767e+07 15 2.759844e+06
6
1
poor security/inside job 5.214200e+06 2 2.607100e+06
7
1
ransomware hacked 1.648922e+06 1 1.648922e+06
8
1
accidentally uploaded 1.500000e+06 1 1.500000e+06
9
2
intentionally lost 9.600000e+05 1 9.600000e+05
0
In [47]:
sns.catplot(data=df_Method, x='Method', y='relative')
plt.title('Total Records per Method')
plt.xticks(rotation=90);
In [48]:
df_heatmap = df.copy(deep=True)
In [49]:
le = LabelEncoder()
df_heatmap['Records'] = le.fit_transform(df_heatmap['Records'])
df_heatmap['Entity'] = le.fit_transform(df_heatmap['Entity'])
df_heatmap['Organization type'] = le.fit_transform(df_heatmap['Organization
type'])
df_heatmap['Method'] = le.fit_transform(df_heatmap['Method'])
In [50]:
plt.figure(figsize=(10,5))
sns.heatmap(df_heatmap[['Year', 'Records', 'Organization type',
'Method']].corr(), cmap='Spectral_r', annot=True);
Dataset 2
In [51]:
df2 = pd.read_csv("/kaggle/input/cyber-security-breaches-data/Cyber Security
Breaches.csv")
In [52]:
df2.head()
Out[52]:
Un N S
Name_o Date Type bre bre y
na u t Business_ Individ Location_of Date_Po Su
f_Cover _of_ _of_ ach ach e
me m a Associate uals_A _Breached_ sted_or_ mm
ed_Entit Brea Breac _sta _en a
d: be t _Involved ffected Information Updated ary
y ch h rt d r
0 r e
A
bin
der
cont
aini
Brooke ng 200 2
Army T 10/16 2014- the 9- Na 0
0 1 0 NaN 1000 Theft Paper
Medical X /2009 06-30 prot 10- N 0
Center ecte 16 9
d
heal
th
info
r...
Fiv
e
des
kto
p
Mid
com
America 200 2
pute
Kidney M 9/22/ Network 2014- 9- Na 0
1 2 1 NaN 1000 Theft rs
Stone O 2009 Server 05-30 09- N 0
cont
Associat 22 9
aini
ion, LLC
ng
une
ncr
ypte
d ...
Un N S
Name_o Date Type bre bre y
na u t Business_ Individ Location_of Date_Po Su
f_Cover _of_ _of_ ach ach e
me m a Associate uals_A _Breached_ sted_or_ mm
ed_Entit Brea Breac _sta _en a
d: be t _Involved ffected Information Updated ary
y ch h rt d r
0 r e
Alaska
Departm Other
200 2
ent of Portable
A 10/12 2014- Na 9- Na 0
2 3 2 Health NaN 501 Theft Electronic
K /2009 01-23 N 10- N 0
and Device,
12 9
Social Other
Services
A
lapt
op
was
Health lost
Services by
200 2
for an
D 10/9/ 2014- 9- Na 0
3 4 3 Children NaN 3800 Loss Laptop emp
C 2009 01-23 10- N 0
with loye
09 9
Special e
Need... whi
le
in
tran
...
A
shar
ed
Co
mp
uter
L. 200 2
that
Douglas C 9/27/ Desktop 2014- 9- Na 0
4 5 4 NaN 5257 Theft was
Carlson, A 2009 Computer 01-23 09- N 0
use
M.D. 27 9
d
for
bac
kup
was
...
In [53]:
df2.columns = ['id', 'Number', 'Entity', 'State',
'Business_Associate_Involved', 'Individuals_Affected', 'Date_of_Breach',
'Type_of_Breach', 'Location_of_Breached_Information',
'Date_Posted_or_Updated', 'Summary', 'breach_start',
'breach_end', 'year']
In [54]:
df2.shape
Out[54]:
(1055, 14)
In [55]:
df2.dtypes
Out[55]:
id int64
Number int64
Entity object
State object
Business_Associate_Involved object
Individuals_Affected int64
Date_of_Breach object
Type_of_Breach object
Location_of_Breached_Information object
Date_Posted_or_Updated object
Summary object
breach_start object
breach_end object
year int64
dtype: object
In [56]:
df2.isnull().sum()
Out[56]:
id 0
Number 0
Entity 0
State 0
Business_Associate_Involved 784
Individuals_Affected 0
Date_of_Breach 0
Type_of_Breach 0
Location_of_Breached_Information 0
Date_Posted_or_Updated 0
Summary 913
breach_start 0
breach_end 910
year 0
dtype: int64
In [57]:
df2.head()
Out[57]:
Nu
S Business_ Individ Date_ Type_ Location_of_ Date_Pos Sum brea brea y
i m Enti
ta Associate_ uals_Af of_Br of_Br Breached_Inf ted_or_U mar ch_s ch_ e
d be ty
te Involved fected each each ormation pdated y tart end ar
r
A
bind
er
Bro cont
oke aini
Arm ng 200 2
y T 10/16 2014-06- the 9- Na 0
0 1 0 NaN 1000 Theft Paper
Med X /2009 30 prot 10- N 0
ical ecte 16 9
Cent d
er heal
th
info
r...
Five
Mid
desk
Am
top
eric
com
a
pute
Kid 200 2
rs
ney M 9/22/ Network 2014-05- 9- Na 0
1 2 1 NaN 1000 Theft cont
Ston O 2009 Server 30 09- N 0
aini
e 22 9
ng
Ass
une
ocia
ncry
tion,
pted
LLC
...
Alas
ka
Dep
artm
ent Other
200 2
of Portable
A 10/12 2014-01- 9- Na 0
2 3 2 Heal NaN 501 Theft Electronic NaN
K /2009 23 10- N 0
th Device,
12 9
and Other
Soci
al
Serv
ices
op
Serv was
ices lost
for by
Chil an
dren emp 10- 0
with loye 09 9
Spe e
cial whil
Nee e in
d... tran.
..
A
shar
ed
Co
L.
mpu
Dou
ter 200 2
glas
C 9/27/ Desktop 2014-01- that 9- Na 0
4 5 4 Carl NaN 5257 Theft
A 2009 Computer 23 was 09- N 0
son,
used 27 9
M.D
for
.
back
up
was.
..
In [58]:
df2.drop(['Number','Summary', 'Date_Posted_or_Updated', 'breach_start',
'breach_end', 'Business_Associate_Involved'], axis=1, inplace=True)
In [59]:
df2.head()
Out[59]:
i Stat Individuals_Affect Date_of_Brea Type_of_Brea Location_of_Breached_Inform
Entity year
d e ed ch ch ation
Brooke
Army 200
0 1 TX 1000 10/16/2009 Theft Paper
Medical 9
Center
Mid
America
Kidney 200
1 2 MO 1000 9/22/2009 Theft Network Server
Stone 9
Associatio
n, LLC
Alaska
Departme
nt of Other Portable Electronic 200
2 3 AK 501 10/12/2009 Theft
Health and Device, Other 9
Social
Services
Health
Services
for
200
3 4 Children DC 3800 10/9/2009 Loss Laptop
9
with
Special
Need...
L.
Douglas 200
4 5 CA 5257 9/27/2009 Theft Desktop Computer
Carlson, 9
M.D.
In [60]:
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 1055 non-null int64
1 Entity 1055 non-null object
2 State 1055 non-null object
3 Individuals_Affected 1055 non-null int64
4 Date_of_Breach 1055 non-null object
5 Type_of_Breach 1055 non-null object
6 Location_of_Breached_Information 1055 non-null object
7 year 1055 non-null int64
dtypes: int64(3), object(5)
memory usage: 66.1+ KB
In [61]:
sns.countplot(data=df2, x='year');
plt.title('Data Breaches per Year')
plt.xticks(rotation=90);
In [62]:
sns.countplot(data=df2, x='year', order = df2['year'].value_counts().index);
plt.title('Data Breaches per Year in order')
plt.xticks(rotation=90);
In [63]:
table_year_df2 = df2['year'].value_counts()
table_year_df2
Out[63]:
2013 254
2011 229
2012 227
2010 211
2009 56
2014 56
2008 13
2004 2
2005 2
1997 1
2003 1
2007 1
2006 1
2002 1
Name: year, dtype: int64
In [64]:
sns.countplot(data=df2, x='Type_of_Breach', order =
df2['Type_of_Breach'].value_counts().index);
plt.title('Method')
plt.xticks(rotation=90);
In [65]:
table3 = df2['Type_of_Breach'].value_counts()
table3.head(14)
Out[65]:
Theft 516
Unauthorized Access/Disclosure 148
Other 91
Loss 85
Hacking/IT Incident 75
Improper Disposal 38
Theft, Unauthorized Access/Disclosure 26
Theft, Loss 15
Unknown 10
Unauthorized Access/Disclosure, Hacking/IT Incident 9
Unauthorized Access/Disclosure, Other 8
Loss, Unauthorized Access/Disclosure 5
Theft, Other 5
Theft, Unauthorized Access/Disclosure, Hacking/IT Incident 3
Name: Type_of_Breach, dtype: int64
In [66]:
Type_of_Breach_counts =
df2['Type_of_Breach'].value_counts().rename('Type_of_Breach_counts')
df2_Type_of_Breach = df2.merge(Type_of_Breach_counts.to_frame(),
left_on='Type_of_Breach',
right_index=True)
In [67]:
df2_Type_of_Breach_upper =
df2_Type_of_Breach[df2_Type_of_Breach.Type_of_Breach_counts > 4]
In [68]:
sns.countplot(data=df2_Type_of_Breach_upper, x='Type_of_Breach',
order =
df2_Type_of_Breach_upper['Type_of_Breach'].value_counts().index);
plt.title('Method')
plt.xticks(rotation=90);
In [69]:
table4 = df2['State'].value_counts()
table4
Out[69]:
CA 113
TX 83
FL 66
NY 58
IL 49
PA 40
IN 40
OH 33
TN 32
NC 32
MA 32
PR 31
GA 30
KY 26
MI 26
MO 25
WA 25
AZ 21
MN 21
NJ 20
CO 18
VA 18
MD 18
CT 17
OR 15
WI 14
SC 13
AL 12
AR 11
NM 10
NE 9
UT 9
DC 9
IA 8
LA 7
RI 7
KS 7
OK 6
WV 5
MS 5
NV 5
AK 5
WY 4
NH 4
MT 4
DE 3
ND 3
ID 2
HI 1
SD 1
ME 1
VT 1
Name: State, dtype: int64
In [70]:
State_counts = df2['State'].value_counts().rename('State_counts')
df2_State = df2.merge(State_counts.to_frame(),
left_on='State',
right_index=True)
In [71]:
df2_State_upper = df2_State[df2_State.State_counts >= 15]
In [72]:
plt.figure(figsize=(18,8))
sns.countplot(data=df2_State_upper, x='State', order =
df2_State_upper['State'].value_counts().index);
plt.title('Data Breaches per State')
plt.xticks(rotation=90);
In [73]:
df2_2006 = df2.loc[df2['year']>2006]
In [74]:
plt.figure(figsize=(10,5))
plt.scatter(data = df2_2006, y = 'Individuals_Affected', x = 'year',
alpha=1/2);
In [75]:
plt.figure(figsize=(10,5))
plt.hist2d(data = df2_2006, y = 'Individuals_Affected', x = 'year',
cmin=0.5, cmap = 'icefire')
plt.colorbar();
In [76]:
df2_2006_breach =
df2_2006.loc[df2_2006['Type_of_Breach'].isin(df2_2006['Type_of_Breach'].value
_counts().index[:11])]
plt.figure(figsize=(15,8))
sns.countplot(data = df2_2006_breach, x = 'year', hue = 'Type_of_Breach');
In [78]:
plt.figure(figsize=(10,5))
sns.boxplot(data=df2_2006_breach, y = 'Type_of_Breach', x = 'year');
In [79]:
df2_heatmap = df2.copy(deep=True)
In [80]:
le = LabelEncoder()
df2_heatmap['State'] = le.fit_transform(df2_heatmap['State'])
df2_heatmap['Date_of_Breach'] =
le.fit_transform(df2_heatmap['Date_of_Breach'])
df2_heatmap['Type_of_Breach'] =
le.fit_transform(df2_heatmap['Type_of_Breach'])
df2_heatmap['Location_of_Breached_Information'] =
le.fit_transform(df2_heatmap['Location_of_Breached_Information'])
In [81]:
plt.figure(figsize=(10,5))
sns.heatmap(df2_heatmap[['State', 'Individuals_Affected', 'Date_of_Breach',
'Type_of_Breach',
'Location_of_Breached_Information', 'year']].corr(),
cmap='Spectral_r', annot=True);