1
- """
2
- Testing for the boost module (sklearn.ensemble.boost).
3
- """
1
+ """Testing for the boost module (sklearn.ensemble.boost)."""
4
2
5
3
import numpy as np
6
4
from numpy .testing import assert_array_equal , assert_array_less
7
5
from numpy .testing import assert_array_almost_equal
8
6
from numpy .testing import assert_equal
9
7
from nose .tools import assert_raises
10
8
9
+ from sklearn .cross_validation import train_test_split
11
10
from sklearn .grid_search import GridSearchCV
12
11
from sklearn .ensemble import AdaBoostClassifier
13
12
from sklearn .ensemble import AdaBoostRegressor
13
+ from scipy .sparse import csc_matrix
14
+ from scipy .sparse import csr_matrix
15
+ from scipy .sparse import coo_matrix
16
+ from scipy .sparse import dok_matrix
17
+ from scipy .sparse import lil_matrix
18
+ from sklearn .svm import SVC , SVR
14
19
from sklearn .tree import DecisionTreeClassifier , DecisionTreeRegressor
15
20
from sklearn .utils import shuffle
16
21
from sklearn import datasets
22
+ import time
17
23
18
24
19
25
# Common random state
@@ -215,6 +221,10 @@ def test_error():
215
221
AdaBoostClassifier (algorithm = "foo" ).fit ,
216
222
X , y_class )
217
223
224
+ assert_raises (ValueError ,
225
+ AdaBoostClassifier ().fit ,
226
+ X , y_class , sample_weight = np .asarray ([- 1 ]))
227
+
218
228
219
229
def test_base_estimator ():
220
230
"""Test different base estimators."""
@@ -239,6 +249,158 @@ def test_base_estimator():
239
249
clf .fit (X , y_regr )
240
250
241
251
252
def test_sparse_classification():
    """Check classification with sparse input.

    An AdaBoostClassifier is trained once on each sparse representation of
    the training set and once on the dense array; every prediction-style
    method must then return identical results for both models.
    """

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit as usual, then record the type of X for later checks."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=100,
                                                   n_features=50,
                                                   return_indicator=True,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training: whatever
        # sparse format was passed in, every base estimator must have been
        # fit on a CSC or CSR matrix.
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all(t in (csc_matrix, csr_matrix) for t in types)
350
+
351
+
352
def test_sparse_regression():
    """Check regression with sparse input.

    An AdaBoostRegressor is trained once on each sparse representation of
    the training set and once on the dense array; predictions (final and
    staged) must be identical for both models.
    """

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit as usual, then record the type of X for later checks."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=100, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # NOTE(review): probability=True is carried over unchanged from the
        # classification test; confirm the SVR in use accepts (and needs)
        # this argument.

        # Trained on sparse format
        sparse_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(probability=True),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(probability=True),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_regressor.predict(X_test_sparse)
        dense_results = dense_regressor.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_regressor.staged_predict(X_test_sparse)
        dense_results = dense_regressor.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training: whatever
        # sparse format was passed in, every base estimator must have been
        # fit on a CSC or CSR matrix.
        types = [i.data_type_ for i in sparse_regressor.estimators_]

        assert all(t in (csc_matrix, csr_matrix) for t in types)
402
+
403
+
242
404
if __name__ == "__main__":
    # Allow running this test module directly as a script via nose.
    from nose import runmodule
    runmodule()
0 commit comments