Skip to content

Commit 711ad3e

Browse files
committed
BUG: coerce Categorical in record array creation (GH8626)
BUG: Categorical not created properly with to_frame() from Series (GH8626) BUG: handle astype with passed pd.Categorical (GH8626)
1 parent a30d6ee commit 711ad3e

File tree

6 files changed

+169
-20
lines changed

6 files changed

+169
-20
lines changed

doc/source/whatsnew/v0.15.1.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,41 @@ Experimental
4848
Bug Fixes
4949
~~~~~~~~~
5050

51+
52+
- Bug in coercing ``Categorical`` to a records array, e.g. ``df.to_records()`` (:issue:`8626`)
53+
- Bug in ``Categorical`` not created properly with ``Series.to_frame()`` (:issue:`8626`)
54+
- Bug in ``astype`` of a ``Categorical`` when passed ``pd.Categorical`` as the dtype (this now raises ``TypeError`` correctly) (:issue:`8626`)
5155
- Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)
56+
57+
58+
59+
60+
61+
62+
63+
64+
5265
- Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`)
66+
67+
68+
69+
70+
71+
72+
5373
- Bug in ix/loc block splitting on setitem (manifests with integer-like dtypes, e.g. datetime64) (:issue:`8607`)
74+
75+
76+
77+
78+
79+
80+
81+
82+
83+
84+
85+
86+
87+
5488
- Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)

pandas/core/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ class Categorical(PandasObject):
187187

188188
# For comparisons, so that numpy uses our implementation of the compare ops, which raise
189189
__array_priority__ = 1000
190+
_typ = 'categorical'
190191
ordered = False
191192
name = None
192193

@@ -1464,4 +1465,3 @@ def _convert_to_list_like(list_like):
14641465
else:
14651466
# is this reached?
14661467
return [list_like]
1467-

pandas/core/common.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,10 @@ class AmbiguousIndexError(PandasError, KeyError):
5656
def create_pandas_abc_type(name, attr, comp):
5757
@classmethod
5858
def _check(cls, inst):
59-
return getattr(inst, attr, None) in comp
59+
result = getattr(inst, attr, None)
60+
if result is None:
61+
return False
62+
return result in comp
6063
dct = dict(__instancecheck__=_check,
6164
__subclasscheck__=_check)
6265
meta = type("ABCBase", (type,), dct)
@@ -78,6 +81,7 @@ def _check(cls, inst):
7881
'sparse_time_series'))
7982
ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
8083
('sparse_array', 'sparse_series'))
84+
# comp must be a real tuple: with a bare string, ``result in comp`` would be a
# substring test (matching e.g. 'cat') and would raise for non-string values.
ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical",))
8185

8286

8387
class _ABCGeneric(type):

pandas/core/frame.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
2727
_default_index, _maybe_upcast, _is_sequence,
2828
_infer_dtype_from_scalar, _values_from_object,
29-
is_list_like, _get_dtype, _maybe_box_datetimelike)
29+
is_list_like, _get_dtype, _maybe_box_datetimelike,
30+
is_categorical_dtype)
3031
from pandas.core.generic import NDFrame, _shared_docs
3132
from pandas.core.index import Index, MultiIndex, _ensure_index
3233
from pandas.core.indexing import (_maybe_droplevels,
@@ -332,6 +333,8 @@ def _init_dict(self, data, index, columns, dtype=None):
332333

333334
def _init_ndarray(self, values, index, columns, dtype=None,
334335
copy=False):
336+
# input must be a ndarray, list, Series, index
337+
335338
if isinstance(values, Series):
336339
if columns is None:
337340
if values.name is not None:
@@ -345,9 +348,41 @@ def _init_ndarray(self, values, index, columns, dtype=None,
345348
if not len(values) and columns is not None and len(columns):
346349
values = np.empty((0, 1), dtype=object)
347350

351+
# helper to create the axes as indexes
352+
def _get_axes(N, K, index=index, columns=columns):
353+
# return axes or defaults
354+
355+
if index is None:
356+
index = _default_index(N)
357+
else:
358+
index = _ensure_index(index)
359+
360+
if columns is None:
361+
columns = _default_index(K)
362+
else:
363+
columns = _ensure_index(columns)
364+
return index, columns
365+
366+
# we could have a categorical type passed or coerced to 'category'
367+
# recast this to an _arrays_to_mgr
368+
if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype):
369+
370+
if not hasattr(values,'dtype'):
371+
values = _prep_ndarray(values, copy=copy)
372+
values = values.ravel()
373+
elif copy:
374+
values = values.copy()
375+
376+
index, columns = _get_axes(len(values),1)
377+
return _arrays_to_mgr([ values ], columns, index, columns,
378+
dtype=dtype)
379+
380+
# by definition an array here
381+
# the dtypes will be coerced to a single dtype
348382
values = _prep_ndarray(values, copy=copy)
349383

350384
if dtype is not None:
385+
351386
if values.dtype != dtype:
352387
try:
353388
values = values.astype(dtype)
@@ -356,18 +391,7 @@ def _init_ndarray(self, values, index, columns, dtype=None,
356391
% (dtype, orig))
357392
raise_with_traceback(e)
358393

359-
N, K = values.shape
360-
361-
if index is None:
362-
index = _default_index(N)
363-
else:
364-
index = _ensure_index(index)
365-
366-
if columns is None:
367-
columns = _default_index(K)
368-
else:
369-
columns = _ensure_index(columns)
370-
394+
index, columns = _get_axes(*values.shape)
371395
return create_block_manager_from_blocks([values.T], [columns, index])
372396

373397
@property
@@ -877,7 +901,7 @@ def to_records(self, index=True, convert_datetime64=True):
877901
else:
878902
ix_vals = [self.index.values]
879903

880-
arrays = ix_vals + [self[c].values for c in self.columns]
904+
arrays = ix_vals + [self[c].get_values() for c in self.columns]
881905

882906
count = 0
883907
index_names = list(self.index.names)
@@ -890,7 +914,7 @@ def to_records(self, index=True, convert_datetime64=True):
890914
index_names = ['index']
891915
names = index_names + lmap(str, self.columns)
892916
else:
893-
arrays = [self[c].values for c in self.columns]
917+
arrays = [self[c].get_values() for c in self.columns]
894918
names = lmap(str, self.columns)
895919

896920
dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
@@ -4729,6 +4753,7 @@ def convert(v):
47294753
values = convert(values)
47304754

47314755
else:
4756+
47324757
# drop subclass info, do not copy data
47334758
values = np.asarray(values)
47344759
if copy:

pandas/core/internals.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,21 @@ def is_datelike(self):
9292
""" return True if I am datelike (datetime or timedelta) """
9393
return self.is_datetime or self.is_timedelta
9494

95+
def is_categorical_astype(self, dtype):
96+
"""
97+
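validate that the passed dtype can be used to astype to a categorical;
98+
returns True if dtype is the categorical dtype, False if it is not categorical-like, and raises TypeError for any other categorical-like object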
99+
"""
100+
if com.is_categorical_dtype(dtype):
101+
if dtype == com.CategoricalDtype():
102+
return True
103+
104+
# this is a pd.Categorical, but is not
105+
# a valid type for astypeing
106+
raise TypeError("invalid type {0} for astype".format(dtype))
107+
108+
return False
109+
95110
def to_dense(self):
96111
return self.values.view()
97112

@@ -345,7 +360,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
345360

346361
# may need to convert to categorical
347362
# this is only called for non-categoricals
348-
if com.is_categorical_dtype(dtype):
363+
if self.is_categorical_astype(dtype):
349364
return make_block(Categorical(self.values),
350365
ndim=self.ndim,
351366
placement=self.mgr_locs)
@@ -1682,7 +1697,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
16821697
raise on an except if raise == True
16831698
"""
16841699

1685-
if dtype == com.CategoricalDtype():
1700+
if self.is_categorical_astype(dtype):
16861701
values = self.values
16871702
else:
16881703
values = np.array(self.values).astype(dtype)

pandas/tests/test_categorical.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1072,6 +1072,41 @@ def test_construction_series(self):
10721072
df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index)
10731073
tm.assert_frame_equal(df, expected)
10741074

1075+
def test_construction_frame(self):
1076+
1077+
# GH8626
1078+
1079+
# dict creation
1080+
df = DataFrame({ 'A' : list('abc') },dtype='category')
1081+
expected = Series(list('abc'),dtype='category')
1082+
tm.assert_series_equal(df['A'],expected)
1083+
1084+
# to_frame
1085+
s = Series(list('abc'),dtype='category')
1086+
result = s.to_frame()
1087+
expected = Series(list('abc'),dtype='category')
1088+
tm.assert_series_equal(result[0],expected)
1089+
result = s.to_frame(name='foo')
1090+
expected = Series(list('abc'),dtype='category')
1091+
tm.assert_series_equal(result['foo'],expected)
1092+
1093+
# list-like creation
1094+
df = DataFrame(list('abc'),dtype='category')
1095+
expected = Series(list('abc'),dtype='category')
1096+
tm.assert_series_equal(df[0],expected)
1097+
1098+
# these coerce back to object as the data is spread across columns
1099+
1100+
# ndim != 1
1101+
df = DataFrame([pd.Categorical(list('abc'))])
1102+
expected = DataFrame([list('abc')])
1103+
tm.assert_frame_equal(df,expected)
1104+
1105+
# mixed
1106+
df = DataFrame([pd.Categorical(list('abc')),list('def')])
1107+
expected = DataFrame([list('abc'),list('def')])
1108+
tm.assert_frame_equal(df,expected)
1109+
10751110
def test_reindex(self):
10761111

10771112
index = pd.date_range('20000101', periods=3)
@@ -2223,6 +2258,42 @@ def cmp(a,b):
22232258
# array conversion
22242259
tm.assert_almost_equal(np.array(s),np.array(s.values))
22252260

2261+
# valid conversion
2262+
for valid in [lambda x: x.astype('category'),
2263+
lambda x: x.astype(com.CategoricalDtype()),
2264+
lambda x: x.astype('object').astype('category'),
2265+
lambda x: x.astype('object').astype(com.CategoricalDtype())]:
2266+
2267+
result = valid(s)
2268+
tm.assert_series_equal(result,s)
2269+
2270+
# invalid conversion (these are NOT a dtype)
2271+
for invalid in [lambda x: x.astype(pd.Categorical),
2272+
lambda x: x.astype('object').astype(pd.Categorical)]:
2273+
self.assertRaises(TypeError, lambda : invalid(s))
2274+
2275+
2276+
def test_to_records(self):
2277+
2278+
# GH8626
2279+
2280+
# dict creation
2281+
df = DataFrame({ 'A' : list('abc') },dtype='category')
2282+
expected = Series(list('abc'),dtype='category')
2283+
tm.assert_series_equal(df['A'],expected)
2284+
2285+
# list-like creation
2286+
df = DataFrame(list('abc'),dtype='category')
2287+
expected = Series(list('abc'),dtype='category')
2288+
tm.assert_series_equal(df[0],expected)
2289+
2290+
# to record array
2291+
# this coerces
2292+
result = df.to_records()
2293+
expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
2294+
dtype=[('index', '<i8'), ('0', 'O')])
2295+
tm.assert_almost_equal(result,expected)
2296+
22262297
def test_numeric_like_ops(self):
22272298

22282299
# numeric ops should not succeed
@@ -2262,7 +2333,7 @@ def get_dir(s):
22622333

22632334
def test_pickle_v0_14_1(self):
22642335
cat = pd.Categorical(values=['a', 'b', 'c'],
2265-
levels=['a', 'b', 'c', 'd'],
2336+
categories=['a', 'b', 'c', 'd'],
22662337
name='foobar', ordered=False)
22672338
pickle_path = os.path.join(tm.get_data_path(),
22682339
'categorical_0_14_1.pickle')

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy