Directly compute polynomial features.

amueller · amueller · commit c62238a91fd8 · 2015-03-25T17:34:04.000-04:00
Polynomial features are computed by iterating over all combinations
of features. For each combination of features, the product of the
columns indexed by the combination is computed.

The fit method is now a no-op, and the transform method works with any
number of features (regardless of what fit was called with).

Conflicts:
	doc/whats_new.rst
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -201,6 +201,8 @@ Enhancements
    - The outcome of :func:`manifold.spectral_embedding` was made deterministic
      by flipping the sign of eigen vectors. By `Hasil Sharma`_.
 
+   - Significant performance and memory usage improvements in
+     :class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.
 
    - Numerical stability improvements for :class:`preprocessing.StandardScaler`
      and :func:`preprocessing.scale`. By `Nicolas Goix`_
@@ -3350,3 +3352,7 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Eric Schubert: https://github.com/kno10
 
 .. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241
+
+.. _Dan Blanchard: https://github.com/dan-blanchard
+
+.. _Eric Martin: http://ericmart.in
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
@@ -2,6 +2,7 @@
 #          Mathieu Blondel <mathieu@mblondel.org>
 #          Olivier Grisel <olivier.grisel@ensta.org>
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
+#          Eric Martin <eric@ericmart.in>
 # License: BSD 3 clause
 
 from itertools import chain, combinations
@@ -471,9 +472,16 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
+    powers_ : array, shape (n_input_features, n_output_features)
+        powers_[i, j] is the exponent of the jth input in the ith output.
 
-    powers_ :
-         powers_[i, j] is the exponent of the jth input in the ith output.
+    n_input_features_ : int
+        The total number of input features.
+
+    n_output_features_ : int
+        The total number of polynomial output features. The number of output
+        features is computed by iterating over all suitably sized combinations
+        of input features.
 
     Notes
     -----
@@ -490,23 +498,32 @@ def __init__(self, degree=2, interaction_only=False, include_bias=True):
         self.include_bias = include_bias
 
     @staticmethod
-    def _power_matrix(n_features, degree, interaction_only, include_bias):
-        """Compute the matrix of polynomial powers"""
+    def _combinations(n_features, degree, interaction_only, include_bias):
         comb = (combinations if interaction_only else combinations_w_r)
         start = int(not include_bias)
-        combn = chain.from_iterable(comb(range(n_features), i)
-                                    for i in range(start, degree + 1))
-        powers = np.vstack(bincount(c, minlength=n_features) for c in combn)
-        return powers
+        return chain.from_iterable(comb(range(n_features), i)
+                                   for i in range(start, degree + 1))
+
+    @property
+    def powers_(self):
+        check_is_fitted(self, 'n_input_features_')
+
+        combinations = self._combinations(self.n_input_features_, self.degree,
+                                          self.interaction_only,
+                                          self.include_bias)
+        return np.vstack(np.bincount(c, minlength=self.n_input_features_)
+                         for c in combinations)
 
     def fit(self, X, y=None):
         """
-        Compute the polynomial feature combinations
+        Compute number of output features.
         """
         n_samples, n_features = check_array(X).shape
-        self.powers_ = self._power_matrix(n_features, self.degree,
+        combinations = self._combinations(n_features, self.degree,
                                           self.interaction_only,
                                           self.include_bias)
+        self.n_input_features_ = n_features
+        self.n_output_features_ = sum(1 for _ in combinations)
         return self
 
     def transform(self, X, y=None):
@@ -523,15 +540,24 @@ def transform(self, X, y=None):
             The matrix of features, where NP is the number of polynomial
             features generated from the combination of inputs.
         """
-        check_is_fitted(self, 'powers_')
+        check_is_fitted(self, ['n_input_features_', 'n_output_features_'])
 
         X = check_array(X)
         n_samples, n_features = X.shape
 
-        if n_features != self.powers_.shape[1]:
+        if n_features != self.n_input_features_:
             raise ValueError("X shape does not match training shape")
 
-        return (X[:, None, :] ** self.powers_).prod(-1)
+        # allocate output data
+        XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)
+
+        combinations = self._combinations(n_features, self.degree,
+                                          self.interaction_only,
+                                          self.include_bias)
+        for i, c in enumerate(combinations):
+            XP[:, i] = X[:, c].prod(1)
+
+        return XP
 
 
 def normalize(X, norm='l2', axis=1, copy=True):
@@ -1112,7 +1138,8 @@ def _transform(self, X):
         # We use only those catgorical features of X that are known using fit.
         # i.e lesser than n_values_ using mask.
         # This means, if self.handle_unknown is "ignore", the row_indices and
-        # col_indices corresponding to the unknown categorical feature are ignored.
+        # col_indices corresponding to the unknown categorical feature are
+        # ignored.
         mask = (X < self.n_values_).ravel()
         if np.any(~mask):
             if self.handle_unknown not in ['error', 'ignore']:
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
@@ -70,8 +70,6 @@ def test_polynomial_features():
     X_poly = interact.fit_transform(X)
     assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
 
-    assert_raises(ValueError, interact.transform, X[:, 1:])
-
 
 def test_scaler_1d():
     """Test scaling of dataset along single axis"""