coderark
diff --git a/‎doc/developers/utilities.rst
Lines changed: 1 addition & 1 deletion b/‎doc/developers/utilities.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎doc/modules/classes.rst
Lines changed: 1 addition & 0 deletions b/‎doc/modules/classes.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/modules/model_evaluation.rst
Lines changed: 4 additions & 25 deletions b/‎doc/modules/model_evaluation.rst
Lines changed: 4 additions & 25 deletions
diff --git a/‎doc/modules/multiclass.rst
Lines changed: 17 additions & 29 deletions b/‎doc/modules/multiclass.rst
Lines changed: 17 additions & 29 deletions
diff --git a/‎doc/modules/preprocessing.rst
Lines changed: 2 additions & 1 deletion b/‎doc/modules/preprocessing.rst
Lines changed: 2 additions & 1 deletion
diff --git a/‎doc/whats_new.rst
Lines changed: 6 additions & 0 deletions b/‎doc/whats_new.rst
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/plot_multilabel.py
Lines changed: 5 additions & 5 deletions b/‎examples/plot_multilabel.py
Lines changed: 5 additions & 5 deletions
diff --git a/‎sklearn/datasets/samples_generator.py
Lines changed: 10 additions & 2 deletions b/‎sklearn/datasets/samples_generator.py
Lines changed: 10 additions & 2 deletions
diff --git a/‎sklearn/datasets/tests/test_samples_generator.py
Lines changed: 5 additions & 4 deletions b/‎sklearn/datasets/tests/test_samples_generator.py
Lines changed: 5 additions & 4 deletions
@@ -244,7 +244,7 @@ Multiclass and multilabel utility function
   a classification output is in label indicator matrix format.
 
 - :func:`multiclass.unique_labels`: Helper function to extract an ordered
-  array of unique labels from a list of labels.
+  array of unique labels from different formats of target.
 
 
 Helper Functions
 
@@ -1061,6 +1061,7 @@ Pairwise metrics
    preprocessing.KernelCenterer
    preprocessing.LabelBinarizer
    preprocessing.LabelEncoder
+   preprocessing.MultiLabelBinarizer
    preprocessing.MinMaxScaler
    preprocessing.Normalizer
    preprocessing.OneHotEncoder
 
@@ -259,16 +259,11 @@ where :math:`1(x)` is the `indicator function
   >>> accuracy_score(y_true, y_pred, normalize=False)
   2
 
-In the multilabel case with binary indicator format:
+In the multilabel case with binary label indicators: ::
 
   >>> accuracy_score(np.array([[0.0, 1.0], [1.0, 1.0]]), np.ones((2, 2)))
   0.5
 
-and with a list of labels format:
-
-  >>> accuracy_score([(1,), (3,)], [(1, 2), tuple()])
-  0.0
-
 .. topic:: Example:
 
   * See :ref:`example_plot_permutation_test_for_classification.py`
@@ -377,16 +372,11 @@ where :math:`1(x)` is the `indicator function
   >>> hamming_loss(y_true, y_pred)
   0.25
 
-In the multilabel case with binary indicator format: ::
+In the multilabel case with binary label indicators: ::
 
   >>> hamming_loss(np.array([[0.0, 1.0], [1.0, 1.0]]), np.zeros((2, 2)))
   0.75
 
-and with a list of labels format: ::
-
-  >>> hamming_loss([(1, 2), (3,)], [(1, 2), tuple()])  # doctest: +ELLIPSIS
-  0.166...
-
 .. note::
 
     In multiclass classification, the Hamming loss corresponds to the Hamming
@@ -434,17 +424,11 @@ score is equal to the classification accuracy.
   >>> jaccard_similarity_score(y_true, y_pred, normalize=False)
   2
 
-In the multilabel case with binary indicator format:
+In the multilabel case with binary label indicators: ::
 
   >>> jaccard_similarity_score(np.array([[0.0, 1.0], [1.0, 1.0]]), np.ones((2, 2)))
   0.75
 
-and with a list of labels format:
-
-  >>> jaccard_similarity_score([(1,), (3,)], [(1, 2), tuple()])
-  0.25
-
-
 .. _precision_recall_f_measure_metrics:
 
 Precision, recall and F-measures
@@ -897,16 +881,11 @@ where :math:`1(x)` is the `indicator function
   >>> zero_one_loss(y_true, y_pred, normalize=False)
   1
 
-In the multilabel case with binary indicator format:
+In the multilabel case with binary label indicators: ::
 
   >>> zero_one_loss(np.array([[0.0, 1.0], [1.0, 1.0]]), np.ones((2, 2)))
   0.5
 
-and with a list of labels format:
-
-  >>> zero_one_loss([(1,), (3,)], [(1, 2), tuple()])
-  1.0
-
 
 .. topic:: Example:
 
 
@@ -77,43 +77,31 @@ tasks :ref:`Decision Trees <tree>`, :ref:`Random Forests <forest>`,
 Multilabel classification format
 ================================
 
-In multilabel learning, the joint set of binary classification tasks
-is expressed with either a sequence of sequences or a label binary indicator
-array.
-
-In the sequence of sequences format, each set of labels is represented as
-a sequence of integer, e.g. ``[0]``, ``[1, 2]``. An empty set of labels is
-then expressed as ``[]``, and a set of samples as ``[[0], [1, 2], []]``.
-In the label indicator format, each sample is one row of a 2d array of
-shape (n_samples, n_classes) with binary values: the one, i.e. the non zero
-elements, corresponds to the subset of labels. Our previous example is
-therefore expressed as ``np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0])``
-and an empty set of labels would be represented by a row of zero elements.
-
-
-In the preprocessing module, the transformer
-:class:`sklearn.preprocessing.label_binarize` and the function
-:func:`sklearn.preprocessing.LabelBinarizer`
-can help you to convert the sequence of sequences format to the label
-indicator format.
+In multilabel learning, the joint set of binary classification tasks is
+expressed with a label binary indicator array: each sample is one row of a 2d
+array of shape (n_samples, n_classes) with binary values, where the ones, i.e.
+the non-zero elements, correspond to the subset of labels. An array such as
+``np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]])`` represents label 0 in the first
+sample, labels 1 and 2 in the second sample, and no labels in the third sample.
+
+Producing multilabel data as a list of sets of labels may be more intuitive.
+The transformer :class:`MultiLabelBinarizer <preprocessing.MultiLabelBinarizer>`
+will convert between a collection of collections of labels and the indicator
+format.
 
   >>> from sklearn.datasets import make_multilabel_classification
-  >>> from sklearn.preprocessing import LabelBinarizer
-  >>> X, Y = make_multilabel_classification(n_samples=5, random_state=0)
+  >>> from sklearn.preprocessing import MultiLabelBinarizer
+  >>> X, Y = make_multilabel_classification(n_samples=5, random_state=0,
+  ...                                       return_indicator=False)
   >>> Y
   ([0, 1, 2], [4, 1, 0, 2], [4, 0, 1], [1, 0], [3, 2])
-  >>> LabelBinarizer().fit_transform(Y)
+  >>> MultiLabelBinarizer().fit_transform(Y)
   array([[1, 1, 1, 0, 0],
          [1, 1, 1, 0, 1],
          [1, 1, 0, 0, 1],
          [1, 1, 0, 0, 0],
          [0, 0, 1, 1, 0]])
 
-.. warning::
-
-    - The sequence of sequences format will disappear in a near future.
-    - Most estimators and functions support both multilabel format.
-
 
 One-Vs-The-Rest
 ===============
@@ -151,8 +139,8 @@ Multilabel learning
 -------------------
 
 :class:`OneVsRestClassifier` also supports multilabel classification.
-To use this feature, feed the classifier a list of tuples containing
-target labels, like in the example below.
+To use this feature, feed the classifier an indicator matrix, in which cell
+[i, j] indicates the presence of label j in sample i.
 
 
 .. figure:: ../auto_examples/images/plot_multilabel_1.png
 
@@ -377,8 +377,9 @@ matrix from a list of multi-class labels::
     array([[1, 0, 0, 0],
            [0, 0, 0, 1]])
 
-:class:`LabelBinarizer` also supports multiple labels per instance::
+For multiple labels per instance, use :class:`MultiLabelBinarizer`::
 
+    >>> lb = preprocessing.MultiLabelBinarizer()
     >>> lb.fit_transform([(1, 2), (3,)])
     array([[1, 1, 0],
            [0, 0, 1]])
 
@@ -235,6 +235,12 @@ API changes summary
    - :class:`cluster.WardClustering` is deprecated. Use
      :class:`cluster.AgglomerativeClustering` instead.
 
+   - Direct support for the sequence of sequences (or list of lists) multilabel
+     format is deprecated. To convert to and from the supported binary
+     indicator matrix format, use
+     :class:`MultiLabelBinarizer <preprocessing.MultiLabelBinarizer>`.
+     By `Joel Nothman`_.
+
    - Add score method to :class:`PCA <decomposition.PCA>` following the model of
      probabilistic PCA and deprecate
      :class:`ProbabilisticPCA <decomposition.ProbabilisticPCA>` model whose
 
@@ -55,9 +55,7 @@ def plot_subfigure(X, Y, subplot, title, transform):
     if transform == "pca":
         X = PCA(n_components=2).fit_transform(X)
     elif transform == "cca":
-        # Convert list of tuples to a class indicator matrix first
-        Y_indicator = LabelBinarizer().fit(Y).transform(Y)
-        X = CCA(n_components=2).fit(X, Y_indicator).transform(X)
+        X = CCA(n_components=2).fit(X, Y).transform(X)
     else:
         raise ValueError
 
@@ -73,8 +71,8 @@ def plot_subfigure(X, Y, subplot, title, transform):
     pl.subplot(2, 2, subplot)
     pl.title(title)
 
-    zero_class = np.where([0 in y for y in Y])
-    one_class = np.where([1 in y for y in Y])
+    zero_class = np.where(Y[:, 0])
+    one_class = np.where(Y[:, 1])
     pl.scatter(X[:, 0], X[:, 1], s=40, c='gray')
     pl.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b',
                facecolors='none', linewidths=2, label='Class 1')
@@ -100,13 +98,15 @@ def plot_subfigure(X, Y, subplot, title, transform):
 
 X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
                                       allow_unlabeled=True,
+                                      return_indicator=True,
                                       random_state=1)
 
 plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca")
 plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca")
 
 X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
                                       allow_unlabeled=False,
+                                      return_indicator=True,
                                       random_state=1)
 
 plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca")
 
@@ -7,10 +7,11 @@
 # License: BSD 3 clause
 
 import numbers
+import warnings
 import numpy as np
 from scipy import linalg
 
-from ..preprocessing import LabelBinarizer
+from ..preprocessing import MultiLabelBinarizer
 from ..utils import array2d, check_random_state
 from ..utils import shuffle as util_shuffle
 from ..utils.random import sample_without_replacement
@@ -336,8 +337,15 @@ def sample_example():
     X, Y = zip(*[sample_example() for i in range(n_samples)])
 
     if return_indicator:
-        lb = LabelBinarizer()
+        lb = MultiLabelBinarizer()
         Y = lb.fit([range(n_classes)]).transform(Y)
+    else:
+        warnings.warn('Support for the sequence of sequences multilabel '
+                      'representation is being deprecated and replaced with '
+                      'a sparse indicator matrix. '
+                      'return_indicator wil default to True from version '
+                      '0.17.',
+                      DeprecationWarning)
 
     return np.array(X, dtype=np.float64), Y
 
 
@@ -13,6 +13,7 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_less
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_warns
 
 from sklearn.datasets import make_classification
 from sklearn.datasets import make_multilabel_classification
@@ -131,11 +132,11 @@ def test_make_classification_informative_features():
                   n_clusters_per_class=2)
 
 
-def test_make_multilabel_classification():
+def test_make_multilabel_classification_return_sequences():
     for allow_unlabeled, min_length in zip((True, False), (0, 1)):
-        X, Y = make_multilabel_classification(n_samples=100, n_features=20,
-                                              n_classes=3, random_state=0,
-                                              allow_unlabeled=allow_unlabeled)
+        X, Y = assert_warns(DeprecationWarning, make_multilabel_classification,
+                            n_samples=100, n_features=20, n_classes=3,
+                            random_state=0, allow_unlabeled=allow_unlabeled)
         assert_equal(X.shape, (100, 20), "X shape mismatch")
         if not allow_unlabeled:
             assert_equal(max([max(y) for y in Y]), 2)