Skip to content

Commit 78ccfac

Browse files
author
TomAugspurger
committed
ENH: let get_dummies take a DataFrame
implement via 1d fixup docstring, tests add documentation test for dicts
1 parent b82a4e6 commit 78ccfac

File tree

4 files changed

+241
-6
lines changed

4 files changed

+241
-6
lines changed

doc/source/reshaping.rst

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,49 @@ This function is often used along with discretization functions like ``cut``:
480480
481481
See also :func:`Series.str.get_dummies <pandas.core.strings.StringMethods.get_dummies>`.
482482

483+
.. versionadded:: 0.15.0
484+
485+
:func:`get_dummies` also accepts a DataFrame. By default all categorical
486+
variables (categorical in the statistical sense,
487+
those with `object` or `category` dtype) are encoded as dummy variables.
488+
489+
490+
.. ipython:: python
491+
492+
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
493+
'C': [1, 2, 3]})
494+
pd.get_dummies(df)
495+
496+
All columns that are not encoded are included untouched in the output.
497+
498+
You can control the columns that are encoded with the ``columns`` keyword.
499+
500+
.. ipython:: python
501+
502+
pd.get_dummies(df, columns=['A'])
503+
504+
Notice that the ``B`` column is still included in the output; it just hasn't
505+
been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't
506+
want to include it in the output.
507+
508+
As with the Series version, you can pass values for the ``prefix`` and
509+
``prefix_sep``. By default the column name is used as the prefix, and '_' as
510+
the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:
511+
512+
- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column
513+
to be encoded
514+
- list: Must be the same length as the number of columns being encoded.
515+
- dict: Mapping column name to prefix
516+
517+
.. ipython:: python
518+
519+
simple = pd.get_dummies(df, prefix='new_prefix')
520+
simple
521+
from_list = pd.get_dummies(df, prefix=['from_A', 'from_B'])
522+
from_list
523+
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
524+
from_dict
525+
483526
Factorizing values
484527
------------------
485528

doc/source/v0.15.0.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,15 @@ Enhancements
461461

462462

463463

464+
- The ``get_dummies`` method can now be used on DataFrames. By default only
465+
categorical columns are encoded as 0's and 1's, while other columns are
466+
left untouched.
464467

468+
.. ipython:: python
469+
470+
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
471+
'C': [1, 2, 3]})
472+
pd.get_dummies(df)
465473

466474

467475

pandas/core/reshape.py

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -981,25 +981,34 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
981981
"""
982982
result = data.drop(cat_variables, axis=1)
983983
for variable in cat_variables:
984-
dummies = get_dummies(data[variable], prefix=variable,
985-
prefix_sep=prefix_sep)
984+
dummies = _get_dummies_1d(data[variable], prefix=variable,
985+
prefix_sep=prefix_sep)
986986
result = result.join(dummies)
987987
return result
988988

989989

990-
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
990+
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
991+
columns=None):
991992
"""
992993
Convert categorical variable into dummy/indicator variables
993994
994995
Parameters
995996
----------
996-
data : array-like or Series
997-
prefix : string, default None
997+
data : array-like, Series, or DataFrame
998+
prefix : string, list of strings, or dict of strings, default None
998999
String to append DataFrame column names
1000+
Pass a list with length equal to the number of columns
1001+
when calling get_dummies on a DataFrame. Alternatively, `prefix`
1002+
can be a dictionary mapping column names to prefixes.
9991003
prefix_sep : string, default '_'
1000-
If appending prefix, separator/delimiter to use
1004+
If appending prefix, separator/delimiter to use. Or pass a
1005+
list or dictionary as with `prefix`.
10011006
dummy_na : bool, default False
10021007
Add a column to indicate NaNs, if False NaNs are ignored.
1008+
columns : list-like, default None
1009+
Column names in the DataFrame to be encoded.
1010+
If `columns` is None then all the columns with
1011+
`object` or `category` dtype will be converted.
10031012
10041013
Returns
10051014
-------
@@ -1031,9 +1040,71 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
10311040
1 0 1 0
10321041
2 0 0 1
10331042
1043+
>>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
1044+
'C': [1, 2, 3]})
1045+
1046+
>>> get_dummies(df, prefix=['col1', 'col2'])
1047+
C col1_a col1_b col2_a col2_b col2_c
1048+
0 1 1 0 0 1 0
1049+
1 2 0 1 1 0 0
1050+
2 3 1 0 0 0 1
1051+
10341052
See also ``Series.str.get_dummies``.
10351053
10361054
"""
1055+
from pandas.tools.merge import concat
1056+
from itertools import cycle
1057+
1058+
if isinstance(data, DataFrame):
1059+
# determine columns being encoded
1060+
1061+
if columns is None:
1062+
columns_to_encode = data.select_dtypes(include=['object',
1063+
'category']).columns
1064+
else:
1065+
columns_to_encode = columns
1066+
1067+
# validate prefixes and separator to avoid silently dropping cols
1068+
def check_len(item, name):
1069+
length_msg = ("Length of '{0}' ({1}) did "
1070+
"not match the length of the columns "
1071+
"being encoded ({2}).")
1072+
1073+
if com.is_list_like(item):
1074+
if not len(item) == len(columns_to_encode):
1075+
raise ValueError(length_msg.format(name, len(item),
1076+
len(columns_to_encode)))
1077+
1078+
check_len(prefix, 'prefix')
1079+
check_len(prefix_sep, 'prefix_sep')
1080+
if isinstance(prefix, compat.string_types):
1081+
prefix = cycle([prefix])
1082+
if isinstance(prefix, dict):
1083+
prefix = [prefix[col] for col in columns_to_encode]
1084+
1085+
if prefix is None:
1086+
prefix = columns_to_encode
1087+
1088+
# validate separators
1089+
if isinstance(prefix_sep, compat.string_types):
1090+
prefix_sep = cycle([prefix_sep])
1091+
elif isinstance(prefix_sep, dict):
1092+
prefix_sep = [prefix_sep[col] for col in columns_to_encode]
1093+
1094+
result = data.drop(columns_to_encode, axis=1)
1095+
with_dummies = [result]
1096+
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
1097+
1098+
dummy = _get_dummies_1d(data[col], prefix=pre,
1099+
prefix_sep=sep, dummy_na=dummy_na)
1100+
with_dummies.append(dummy)
1101+
result = concat(with_dummies, axis=1)
1102+
else:
1103+
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
1104+
return result
1105+
1106+
1107+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
10371108
# Series avoids inconsistent NaN handling
10381109
cat = Categorical.from_array(Series(data))
10391110
levels = cat.levels

pandas/tests/test_reshape.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ def test_multiindex(self):
149149

150150

151151
class TestGetDummies(tm.TestCase):
152+
153+
def setUp(self):
154+
self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
155+
'C': [1, 2, 3]})
156+
152157
def test_basic(self):
153158
s_list = list('abc')
154159
s_series = Series(s_list)
@@ -209,6 +214,114 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
209214
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
210215
assert_frame_equal(res, exp)
211216

217+
def test_dataframe_dummies_all_obj(self):
218+
df = self.df[['A', 'B']]
219+
result = get_dummies(df)
220+
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0],
221+
'B_b': [1., 1, 0], 'B_c': [0., 0, 1]})
222+
assert_frame_equal(result, expected)
223+
224+
def test_dataframe_dummies_mix_default(self):
225+
df = self.df
226+
result = get_dummies(df)
227+
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
228+
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
229+
'B_c': [0., 0, 1]})
230+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
231+
assert_frame_equal(result, expected)
232+
233+
def test_dataframe_dummies_prefix_list(self):
234+
prefixes = ['from_A', 'from_B']
235+
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
236+
'C': [1, 2, 3]})
237+
result = get_dummies(df, prefix=prefixes)
238+
expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1],
239+
'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0],
240+
'from_B_c': [0., 0, 1]})
241+
expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
242+
'from_B_c']]
243+
assert_frame_equal(result, expected)
244+
245+
def test_datafrmae_dummies_prefix_str(self):
246+
# not that you should do this...
247+
df = self.df
248+
result = get_dummies(df, prefix='bad')
249+
expected = DataFrame([[1, 1., 0., 1., 0.],
250+
[2, 0., 1., 1., 0.],
251+
[3, 1., 0., 0., 1.]],
252+
columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'])
253+
assert_frame_equal(result, expected)
254+
255+
def test_dataframe_dummies_subset(self):
256+
df = self.df
257+
result = get_dummies(df, prefix=['from_A'],
258+
columns=['A'])
259+
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
260+
'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
261+
assert_frame_equal(result, expected)
262+
263+
def test_dataframe_dummies_prefix_sep(self):
264+
df = self.df
265+
result = get_dummies(df, prefix_sep='..')
266+
expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
267+
'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
268+
'B..c': [0., 0, 1]})
269+
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
270+
assert_frame_equal(result, expected)
271+
272+
result = get_dummies(df, prefix_sep=['..', '__'])
273+
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
274+
assert_frame_equal(result, expected)
275+
276+
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
277+
assert_frame_equal(result, expected)
278+
279+
def test_dataframe_dummies_prefix_bad_length(self):
280+
with tm.assertRaises(ValueError):
281+
get_dummies(self.df, prefix=['too few'])
282+
283+
def test_dataframe_dummies_prefix_sep_bad_length(self):
284+
with tm.assertRaises(ValueError):
285+
get_dummies(self.df, prefix_sep=['bad'])
286+
287+
def test_dataframe_dummies_prefix_dict(self):
288+
prefixes = {'A': 'from_A', 'B': 'from_B'}
289+
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
290+
'C': [1, 2, 3]})
291+
result = get_dummies(df, prefix=prefixes)
292+
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
293+
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
294+
'C': [1, 2, 3]})
295+
assert_frame_equal(result, expected)
296+
297+
def test_dataframe_dummies_with_na(self):
298+
df = self.df
299+
df.loc[3, :] = [np.nan, np.nan, np.nan]
300+
result = get_dummies(df, dummy_na=True)
301+
expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
302+
'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0],
303+
'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]})
304+
expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
305+
'B_nan']]
306+
assert_frame_equal(result, expected)
307+
308+
result = get_dummies(df, dummy_na=False)
309+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
310+
assert_frame_equal(result, expected)
311+
312+
def test_dataframe_dummies_with_categorical(self):
313+
df = self.df
314+
df['cat'] = pd.Categorical(['x', 'y', 'y'])
315+
result = get_dummies(df)
316+
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
317+
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
318+
'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
319+
'cat_y': [0., 1, 1]})
320+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c',
321+
'cat_x', 'cat_y']]
322+
assert_frame_equal(result, expected)
323+
324+
212325
class TestConvertDummies(tm.TestCase):
213326
def test_convert_dummies(self):
214327
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy