Skip to content

Commit 96436df

Browse files
committed
Merge pull request scikit-learn#3161 from hamsal/ada-sparse
[MRG] AdaBoost Sparse Input Support
2 parents 795f377 + 32f0069 commit 96436df

File tree

2 files changed

+241
-47
lines changed

2 files changed

+241
-47
lines changed

sklearn/ensemble/tests/test_weight_boosting.py

100644 → 100755
Lines changed: 165 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,25 @@
1-
"""
2-
Testing for the boost module (sklearn.ensemble.boost).
3-
"""
1+
"""Testing for the boost module (sklearn.ensemble.boost)."""
42

53
import numpy as np
64
from numpy.testing import assert_array_equal, assert_array_less
75
from numpy.testing import assert_array_almost_equal
86
from numpy.testing import assert_equal
97
from nose.tools import assert_raises
108

9+
from sklearn.cross_validation import train_test_split
1110
from sklearn.grid_search import GridSearchCV
1211
from sklearn.ensemble import AdaBoostClassifier
1312
from sklearn.ensemble import AdaBoostRegressor
13+
from scipy.sparse import csc_matrix
14+
from scipy.sparse import csr_matrix
15+
from scipy.sparse import coo_matrix
16+
from scipy.sparse import dok_matrix
17+
from scipy.sparse import lil_matrix
18+
from sklearn.svm import SVC, SVR
1419
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
1520
from sklearn.utils import shuffle
1621
from sklearn import datasets
22+
import time
1723

1824

1925
# Common random state
@@ -215,6 +221,10 @@ def test_error():
215221
AdaBoostClassifier(algorithm="foo").fit,
216222
X, y_class)
217223

224+
assert_raises(ValueError,
225+
AdaBoostClassifier().fit,
226+
X, y_class, sample_weight=np.asarray([-1]))
227+
218228

219229
def test_base_estimator():
220230
"""Test different base estimators."""
@@ -239,6 +249,158 @@ def test_base_estimator():
239249
clf.fit(X, y_regr)
240250

241251

252+
def test_sparse_classification():
    """Check classification with sparse input.

    For every supported scipy sparse format, an AdaBoostClassifier fitted on
    the sparse training set must give exactly the same outputs as one fitted
    on the dense training set, for every prediction/scoring method, and each
    base estimator must have been fitted on a CSC or CSR matrix.
    """

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit like SVC, then record the type of X in ``data_type_``.

            The recorded type carries over to the end of the test, where it
            is used to verify what kind of matrix each base estimator was
            actually trained on.
            """
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    def assert_stages_equal(sparse_stages, dense_stages):
        """Compare two staged_* generators stage by stage.

        Both generators are materialised first so that a differing number of
        stages is reported instead of being silently truncated by zip().
        """
        sparse_stages = list(sparse_stages)
        dense_stages = list(dense_stages)
        assert_equal(len(sparse_stages), len(dense_stages))
        for sparse_res, dense_res in zip(sparse_stages, dense_stages):
            assert_array_equal(sparse_res, dense_res)

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=100,
                                                   n_features=50,
                                                   return_indicator=True,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Methods taking only X, checked in this order for every format.
    direct_methods = ("predict", "decision_function", "predict_log_proba",
                      "predict_proba")
    staged_methods = ("staged_decision_function", "staged_predict",
                      "staged_predict_proba")

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict, decision_function, predict_log_proba, predict_proba
        for method in direct_methods:
            sparse_results = getattr(sparse_classifier, method)(X_test_sparse)
            dense_results = getattr(dense_classifier, method)(X_test)
            assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function, staged_predict, staged_predict_proba
        for method in staged_methods:
            assert_stages_equal(
                getattr(sparse_classifier, method)(X_test_sparse),
                getattr(dense_classifier, method)(X_test))

        # staged_score
        assert_stages_equal(
            sparse_classifier.staged_score(X_test_sparse, y_test),
            dense_classifier.staged_score(X_test, y_test))

        # Verify sparsity of data is maintained during training: whatever
        # sparse format was passed in, every base estimator must have been
        # fitted on a CSC or CSR matrix.
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all(t in (csc_matrix, csr_matrix) for t in types)
350+
351+
352+
def test_sparse_regression():
    """Check regression with sparse input.

    For every supported scipy sparse format, an AdaBoostRegressor fitted on
    the sparse training set must give exactly the same predictions as one
    fitted on the dense training set, and each base estimator must have been
    fitted on a CSC or CSR matrix.
    """

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit like SVR, then record the type of X in ``data_type_``.

            The recorded type carries over to the end of the test, where it
            is used to verify what kind of matrix each base estimator was
            actually trained on.
            """
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=100, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # NOTE(review): the original passed probability=True to the SVR base
        # estimator. That looks copied from the classification test and is
        # presumably a classifier-only option, so the default is used here --
        # confirm against the targeted scikit-learn version.

        # Trained on sparse format
        sparse_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_regressor.predict(X_test_sparse)
        dense_results = dense_regressor.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict: materialise both generators so that a differing
        # number of stages is reported instead of being truncated by zip().
        sparse_stages = list(sparse_regressor.staged_predict(X_test_sparse))
        dense_stages = list(dense_regressor.staged_predict(X_test))
        assert_equal(len(sparse_stages), len(dense_stages))
        for sparse_res, dense_res in zip(sparse_stages, dense_stages):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training: whatever
        # sparse format was passed in, every base estimator must have been
        # fitted on a CSC or CSR matrix.
        types = [i.data_type_ for i in sparse_regressor.estimators_]

        assert all(t in (csc_matrix, csr_matrix) for t in types)
402+
403+
242404
# Script entry point: when this file is executed directly, hand the module
# over to nose via nose.runmodule().
if __name__ == "__main__":
    import nose

    nose.runmodule()

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy