Skip to content

Commit 4a4fe0b

Browse files
committed
Merge pull request pandas-dev#10290 from jreback/mi_perf
PERF: improved performance of multiindex slicing
2 parents ad37b5d + 2874420 commit 4a4fe0b

File tree

10 files changed

+156
-40
lines changed

10 files changed

+156
-40
lines changed

doc/source/whatsnew/v0.17.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ Performance Improvements
6363

6464
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
6565
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
66+
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
67+
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
6668

6769
.. _whatsnew_0170.bug_fixes:
6870

pandas/core/common.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2497,6 +2497,10 @@ def is_integer_dtype(arr_or_dtype):
24972497
return (issubclass(tipo, np.integer) and
24982498
not issubclass(tipo, (np.datetime64, np.timedelta64)))
24992499

2500+
def is_int64_dtype(arr_or_dtype):
2501+
tipo = _get_dtype_type(arr_or_dtype)
2502+
return issubclass(tipo, np.int64)
2503+
25002504

25012505
def is_int_or_datetime_dtype(arr_or_dtype):
25022506
tipo = _get_dtype_type(arr_or_dtype)

pandas/core/index.py

Lines changed: 73 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ class Index(IndexOpsMixin, PandasObject):
105105
_is_numeric_dtype = False
106106

107107
_engine_type = _index.ObjectEngine
108+
_isin_type = lib.ismember
108109

109110
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
110111
tupleize_cols=True, **kwargs):
@@ -1838,7 +1839,7 @@ def isin(self, values, level=None):
18381839
value_set = set(values)
18391840
if level is not None:
18401841
self._validate_index_level(level)
1841-
return lib.ismember(np.array(self), value_set)
1842+
return self._isin_type(np.array(self), value_set)
18421843

18431844
def _can_reindex(self, indexer):
18441845
"""
@@ -3379,6 +3380,7 @@ class Int64Index(NumericIndex):
33793380
_outer_indexer = _algos.outer_join_indexer_int64
33803381

33813382
_engine_type = _index.Int64Engine
3383+
_isin_type = lib.ismember_int64
33823384

33833385
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
33843386

@@ -5235,13 +5237,39 @@ def partial_selection(key, indexer=None):
52355237
indexer = self._get_level_indexer(key, level=level)
52365238
return indexer, maybe_droplevels(indexer, [level], drop_level)
52375239

5238-
def _get_level_indexer(self, key, level=0):
5239-
# return a boolean indexer or a slice showing where the key is
5240+
def _get_level_indexer(self, key, level=0, indexer=None):
5241+
# return an indexer, boolean array or a slice showing where the key is
52405242
# in the totality of values
5243+
# if the indexer is provided, then use this
52415244

52425245
level_index = self.levels[level]
52435246
labels = self.labels[level]
52445247

5248+
def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
5249+
# given the inputs and the labels/indexer, compute an indexer set
5250+
# if we have a provided indexer, then this need not consider
5251+
# the entire labels set
5252+
5253+
r = np.arange(start,stop,step)
5254+
if indexer is not None and len(indexer) != len(labels):
5255+
5256+
# we have an indexer which maps the locations in the labels that we
5257+
# have already selected (and is not an indexer for the entire set)
5258+
# otherwise this is wasteful
5259+
# so we only need to examine locations that are in this set
5260+
# the only magic here is that the results are the mappings to the
5261+
# set that we have selected
5262+
from pandas import Series
5263+
mapper = Series(indexer)
5264+
result = Series(Index(labels.take(indexer)).isin(r).nonzero()[0])
5265+
m = result.map(mapper).values
5266+
5267+
else:
5268+
m = np.zeros(len(labels),dtype=bool)
5269+
m[np.in1d(labels,r,assume_unique=True)] = True
5270+
5271+
return m
5272+
52455273
if isinstance(key, slice):
52465274
# handle a slice, returning a slice if we can
52475275
# otherwise a boolean indexer
@@ -5267,17 +5295,13 @@ def _get_level_indexer(self, key, level=0):
52675295
# a partial date slicer on a DatetimeIndex generates a slice
52685296
# note that the stop ALREADY includes the stopped point (if
52695297
# it was a string sliced)
5270-
m = np.zeros(len(labels),dtype=bool)
5271-
m[np.in1d(labels,np.arange(start.start,stop.stop,step))] = True
5272-
return m
5298+
return convert_indexer(start.start,stop.stop,step)
52735299

52745300
elif level > 0 or self.lexsort_depth == 0 or step is not None:
52755301
# need to have like semantics here to right
52765302
# searching as when we are using a slice
52775303
# so include the stop+1 (so we include stop)
5278-
m = np.zeros(len(labels),dtype=bool)
5279-
m[np.in1d(labels,np.arange(start,stop+1,step))] = True
5280-
return m
5304+
return convert_indexer(start,stop+1,step)
52815305
else:
52825306
# sorted, so can return slice object -> view
52835307
i = labels.searchsorted(start, side='left')
@@ -5315,59 +5339,73 @@ def get_locs(self, tup):
53155339
raise KeyError('MultiIndex Slicing requires the index to be fully lexsorted'
53165340
' tuple len ({0}), lexsort depth ({1})'.format(len(tup), self.lexsort_depth))
53175341

5318-
def _convert_indexer(r):
5342+
# indexer
5343+
# this is the list of all values that we want to select
5344+
n = len(self)
5345+
indexer = None
5346+
5347+
def _convert_to_indexer(r):
5348+
# return an indexer
53195349
if isinstance(r, slice):
5320-
m = np.zeros(len(self),dtype=bool)
5350+
m = np.zeros(n,dtype=bool)
53215351
m[r] = True
5322-
return m
5323-
return r
5352+
r = m.nonzero()[0]
5353+
elif is_bool_indexer(r):
5354+
if len(r) != n:
5355+
raise ValueError("cannot index with a boolean indexer that is"
5356+
" not the same length as the index")
5357+
r = r.nonzero()[0]
5358+
return Int64Index(r)
5359+
5360+
def _update_indexer(idxr, indexer=indexer):
5361+
if indexer is None:
5362+
indexer = Index(np.arange(n))
5363+
if idxr is None:
5364+
return indexer
5365+
return indexer & idxr
53245366

5325-
ranges = []
53265367
for i,k in enumerate(tup):
53275368

53285369
if is_bool_indexer(k):
53295370
# a boolean indexer, must be the same length!
53305371
k = np.asarray(k)
5331-
if len(k) != len(self):
5332-
raise ValueError("cannot index with a boolean indexer that is"
5333-
" not the same length as the index")
5334-
ranges.append(k)
5372+
indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer)
5373+
53355374
elif is_list_like(k):
53365375
# a collection of labels to include from this level (these are or'd)
5337-
indexers = []
5376+
indexers = None
53385377
for x in k:
53395378
try:
5340-
indexers.append(_convert_indexer(self._get_level_indexer(x, level=i)))
5379+
idxrs = _convert_to_indexer(self._get_level_indexer(x, level=i, indexer=indexer))
5380+
indexers = idxrs if indexers is None else indexers | idxrs
53415381
except (KeyError):
53425382

53435383
# ignore not founds
53445384
continue
5345-
if len(k):
5346-
ranges.append(reduce(np.logical_or, indexers))
5385+
5386+
if indexers is not None:
5387+
indexer = _update_indexer(indexers, indexer=indexer)
53475388
else:
5348-
ranges.append(np.zeros(self.labels[i].shape, dtype=bool))
5389+
5390+
# no matches we are done
5391+
return Int64Index([]).values
53495392

53505393
elif is_null_slice(k):
53515394
# empty slice
5352-
pass
5395+
indexer = _update_indexer(None, indexer=indexer)
53535396

53545397
elif isinstance(k,slice):
53555398

53565399
# a slice, include BOTH of the labels
5357-
ranges.append(self._get_level_indexer(k,level=i))
5400+
indexer = _update_indexer(_convert_to_indexer(self._get_level_indexer(k,level=i,indexer=indexer)), indexer=indexer)
53585401
else:
53595402
# a single label
5360-
ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0])
5361-
5362-
# identity
5363-
if len(ranges) == 0:
5364-
return slice(0,len(self))
5365-
5366-
elif len(ranges) == 1:
5367-
return ranges[0]
5403+
indexer = _update_indexer(_convert_to_indexer(self.get_loc_level(k,level=i,drop_level=False)[0]), indexer=indexer)
53685404

5369-
# construct a boolean indexer if we have a slice or boolean indexer
5370-
return reduce(np.logical_and,[ _convert_indexer(r) for r in ranges ])
5405+
# empty indexer
5406+
if indexer is None:
5407+
return Int64Index([]).values
5408+
return indexer.values
53715409

53725410
def truncate(self, before=None, after=None):
53735411
"""

pandas/core/indexing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def can_do_equal_len():
509509

510510
def _align_series(self, indexer, ser):
511511
# indexer to assign Series can be tuple, slice, scalar
512-
if isinstance(indexer, (slice, np.ndarray, list)):
512+
if isinstance(indexer, (slice, np.ndarray, list, Index)):
513513
indexer = tuple([indexer])
514514

515515
if isinstance(indexer, tuple):
@@ -1719,7 +1719,7 @@ def maybe_convert_ix(*args):
17191719

17201720
ixify = True
17211721
for arg in args:
1722-
if not isinstance(arg, (np.ndarray, list, ABCSeries)):
1722+
if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)):
17231723
ixify = False
17241724

17251725
if ixify:

pandas/core/series.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
is_list_like, _values_from_object,
2020
_possibly_cast_to_datetime, _possibly_castable,
2121
_possibly_convert_platform, _try_sort,
22+
is_int64_dtype,
2223
ABCSparseArray, _maybe_match_name,
2324
_coerce_to_dtype, SettingWithCopyError,
2425
_maybe_box_datetimelike, ABCDataFrame,
@@ -2250,17 +2251,22 @@ def isin(self, values):
22502251

22512252
# may need i8 conversion for proper membership testing
22522253
comps = _values_from_object(self)
2254+
f = lib.ismember
22532255
if com.is_datetime64_dtype(self):
22542256
from pandas.tseries.tools import to_datetime
22552257
values = Series(to_datetime(values)).values.view('i8')
22562258
comps = comps.view('i8')
2259+
f = lib.ismember_int64
22572260
elif com.is_timedelta64_dtype(self):
22582261
from pandas.tseries.timedeltas import to_timedelta
22592262
values = Series(to_timedelta(values)).values.view('i8')
22602263
comps = comps.view('i8')
2264+
f = lib.ismember_int64
2265+
elif is_int64_dtype(self):
2266+
f = lib.ismember_int64
22612267

22622268
value_set = set(values)
2263-
result = lib.ismember(comps, value_set)
2269+
result = f(comps, value_set)
22642270
return self._constructor(result, index=self.index).__finalize__(self)
22652271

22662272
def between(self, left, right, inclusive=True):

pandas/lib.pyx

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,31 @@ def ismember(ndarray arr, set values):
156156

157157
return result.view(np.bool_)
158158

159+
def ismember_int64(ndarray[int64_t] arr, set values):
160+
'''
161+
Checks whether each element of ``arr`` is contained in ``values``
162+
163+
Parameters
164+
----------
165+
arr : ndarray of int64
166+
values : set
167+
168+
Returns
169+
-------
170+
ismember : ndarray (boolean dtype)
171+
'''
172+
cdef:
173+
Py_ssize_t i, n
174+
ndarray[uint8_t] result
175+
int64_t v
176+
177+
n = len(arr)
178+
result = np.empty(n, dtype=np.uint8)
179+
for i in range(n):
180+
result[i] = arr[i] in values
181+
182+
return result.view(np.bool_)
183+
159184
#----------------------------------------------------------------------
160185
# datetime / io related
161186

pandas/tests/test_indexing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2312,6 +2312,7 @@ def f():
23122312
index=pd.MultiIndex.from_product([['A','B','C'],['foo']],
23132313
names=['one','two'])
23142314
).sortlevel()
2315+
23152316
result = s.loc[idx[:,['foo']]]
23162317
assert_series_equal(result,expected)
23172318
result = s.loc[idx[:,['foo','bah']]]
@@ -2323,9 +2324,9 @@ def f():
23232324
df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index)
23242325
df = df.sortlevel(0, axis=1)
23252326

2327+
expected = DataFrame(index=range(5),columns=multi_index.reindex([])[0])
23262328
result1 = df.loc[:, ([], slice(None))]
23272329
result2 = df.loc[:, (['foo'], [])]
2328-
expected = DataFrame(index=range(5),columns=multi_index.reindex([])[0])
23292330
assert_frame_equal(result1, expected)
23302331
assert_frame_equal(result2, expected)
23312332

pandas/tseries/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ def isin(self, values):
449449
return self.asobject.isin(values)
450450

451451
value_set = set(values.asi8)
452-
return lib.ismember(self.asi8, value_set)
452+
return lib.ismember_int64(self.asi8, value_set)
453453

454454
def shift(self, n, freq=None):
455455
"""

vb_suite/indexing.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,3 +235,33 @@
235235
series_ix_slice = Benchmark("s.ix[:800000]", setup)
236236
series_ix_list_like = Benchmark("s.ix[[800000]]", setup)
237237
series_ix_array = Benchmark("s.ix[np.arange(10000)]", setup)
238+
239+
240+
# multi-index slicing
241+
setup = common_setup + """
242+
np.random.seed(1234)
243+
idx=pd.IndexSlice
244+
n=100000
245+
mdt = pandas.DataFrame()
246+
mdt['A'] = np.random.choice(range(10000,45000,1000), n)
247+
mdt['B'] = np.random.choice(range(10,400), n)
248+
mdt['C'] = np.random.choice(range(1,150), n)
249+
mdt['D'] = np.random.choice(range(10000,45000), n)
250+
mdt['x'] = np.random.choice(range(400), n)
251+
mdt['y'] = np.random.choice(range(25), n)
252+
253+
254+
test_A = 25000
255+
test_B = 25
256+
test_C = 40
257+
test_D = 35000
258+
259+
eps_A = 5000
260+
eps_B = 5
261+
eps_C = 5
262+
eps_D = 5000
263+
mdt2 = mdt.set_index(['A','B','C','D']).sortlevel()
264+
"""
265+
266+
multiindex_slicers = Benchmark('mdt2.loc[idx[test_A-eps_A:test_A+eps_A,test_B-eps_B:test_B+eps_B,test_C-eps_C:test_C+eps_C,test_D-eps_D:test_D+eps_D],:]', setup,
267+
start_date=datetime(2015, 1, 1))

vb_suite/series_methods.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
setup = common_setup + """
88
s1 = Series(np.random.randn(10000))
99
s2 = Series(np.random.randint(1, 10, 10000))
10+
s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
11+
values = [1,2]
12+
s4 = s3.astype('object')
1013
"""
1114

1215
series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);'
@@ -27,3 +30,10 @@
2730
's2.nsmallest(3, take_last=False)',
2831
setup,
2932
start_date=datetime(2014, 1, 25))
33+
34+
series_isin_int64 = Benchmark('s3.isin(values)',
35+
setup,
36+
start_date=datetime(2014, 1, 25))
37+
series_isin_object = Benchmark('s4.isin(values)',
38+
setup,
39+
start_date=datetime(2014, 1, 25))

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy