Skip to content

Commit c62238a

Browse files
committed
Directly compute polynomial features.
Polynomial features are computed by iterating over all combinations of features. For each combination of features, the product of the columns indexed by the combination is computed. The fit method is now a no-op, and the transform method works with any number of features (regardless of what fit was called with). Conflicts: doc/whats_new.rst
1 parent 5b2aba1 commit c62238a

File tree

3 files changed

+47
-16
lines changed

3 files changed

+47
-16
lines changed

doc/whats_new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,8 @@ Enhancements
201201
- The outcome of :func:`manifold.spectral_embedding` was made deterministic
202202
by flipping the sign of eigenvectors. By `Hasil Sharma`_.
203203

204+
- Significant performance and memory usage improvements in
205+
:class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.
204206

205207
- Numerical stability improvements for :class:`preprocessing.StandardScaler`
206208
and :func:`preprocessing.scale`. By `Nicolas Goix`_
@@ -3350,3 +3352,7 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
33503352
.. _Eric Schubert: https://github.com/kno10
33513353

33523354
.. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241
3355+
3356+
.. _Dan Blanchard: https://github.com/dan-blanchard
3357+
3358+
.. _Eric Martin: http://ericmart.in

sklearn/preprocessing/data.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Mathieu Blondel <mathieu@mblondel.org>
33
# Olivier Grisel <olivier.grisel@ensta.org>
44
# Andreas Mueller <amueller@ais.uni-bonn.de>
5+
# Eric Martin <eric@ericmart.in>
56
# License: BSD 3 clause
67

78
from itertools import chain, combinations
@@ -471,9 +472,16 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin):
471472
472473
Attributes
473474
----------
475+
powers_ : array, shape (n_output_features, n_input_features)
476+
powers_[i, j] is the exponent of the jth input in the ith output.
474477
475-
powers_ :
476-
powers_[i, j] is the exponent of the jth input in the ith output.
478+
n_input_features_ : int
479+
The total number of input features.
480+
481+
n_output_features_ : int
482+
The total number of polynomial output features. The number of output
483+
features is computed by iterating over all suitably sized combinations
484+
of input features.
477485
478486
Notes
479487
-----
@@ -490,23 +498,32 @@ def __init__(self, degree=2, interaction_only=False, include_bias=True):
490498
self.include_bias = include_bias
491499

492500
@staticmethod
493-
def _power_matrix(n_features, degree, interaction_only, include_bias):
494-
"""Compute the matrix of polynomial powers"""
501+
def _combinations(n_features, degree, interaction_only, include_bias):
495502
comb = (combinations if interaction_only else combinations_w_r)
496503
start = int(not include_bias)
497-
combn = chain.from_iterable(comb(range(n_features), i)
498-
for i in range(start, degree + 1))
499-
powers = np.vstack(bincount(c, minlength=n_features) for c in combn)
500-
return powers
504+
return chain.from_iterable(comb(range(n_features), i)
505+
for i in range(start, degree + 1))
506+
507+
@property
508+
def powers_(self):
509+
check_is_fitted(self, 'n_input_features_')
510+
511+
combinations = self._combinations(self.n_input_features_, self.degree,
512+
self.interaction_only,
513+
self.include_bias)
514+
return np.vstack(np.bincount(c, minlength=self.n_input_features_)
515+
for c in combinations)
501516

502517
def fit(self, X, y=None):
503518
"""
504-
Compute the polynomial feature combinations
519+
Compute number of output features.
505520
"""
506521
n_samples, n_features = check_array(X).shape
507-
self.powers_ = self._power_matrix(n_features, self.degree,
522+
combinations = self._combinations(n_features, self.degree,
508523
self.interaction_only,
509524
self.include_bias)
525+
self.n_input_features_ = n_features
526+
self.n_output_features_ = sum(1 for _ in combinations)
510527
return self
511528

512529
def transform(self, X, y=None):
@@ -523,15 +540,24 @@ def transform(self, X, y=None):
523540
The matrix of features, where NP is the number of polynomial
524541
features generated from the combination of inputs.
525542
"""
526-
check_is_fitted(self, 'powers_')
543+
check_is_fitted(self, ['n_input_features_', 'n_output_features_'])
527544

528545
X = check_array(X)
529546
n_samples, n_features = X.shape
530547

531-
if n_features != self.powers_.shape[1]:
548+
if n_features != self.n_input_features_:
532549
raise ValueError("X shape does not match training shape")
533550

534-
return (X[:, None, :] ** self.powers_).prod(-1)
551+
# allocate output data
552+
XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)
553+
554+
combinations = self._combinations(n_features, self.degree,
555+
self.interaction_only,
556+
self.include_bias)
557+
for i, c in enumerate(combinations):
558+
XP[:, i] = X[:, c].prod(1)
559+
560+
return XP
535561

536562

537563
def normalize(X, norm='l2', axis=1, copy=True):
@@ -1112,7 +1138,8 @@ def _transform(self, X):
11121138
# We use only those categorical features of X that are known using fit.
11131139
# i.e. less than n_values_ using mask.
11141140
# This means, if self.handle_unknown is "ignore", the row_indices and
1115-
# col_indices corresponding to the unknown categorical feature are ignored.
1141+
# col_indices corresponding to the unknown categorical feature are
1142+
# ignored.
11161143
mask = (X < self.n_values_).ravel()
11171144
if np.any(~mask):
11181145
if self.handle_unknown not in ['error', 'ignore']:

sklearn/preprocessing/tests/test_data.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,6 @@ def test_polynomial_features():
7070
X_poly = interact.fit_transform(X)
7171
assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
7272

73-
assert_raises(ValueError, interact.transform, X[:, 1:])
74-
7573

7674
def test_scaler_1d():
7775
"""Test scaling of dataset along single axis"""

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy