diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..07c92dc1 100644 --- a/configs/README.md +++ b/configs/README.md @@ -117,6 +117,7 @@ Configs have the three highest parameter keys: |:---------------|:--------------|:--------|:------------| | `algorithm`:`estimator` | None | | Name of measured estimator. | | `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. | +| `algorithm`:`num_batches`:`training` | 5 | | Number of `partial_fit` calls to benchmark. Each call receives a batch of the full specified number of samples (the samples are not divided by `num_batches`). For incremental estimators only. | | `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). | | `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. | | `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. 
| diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json new file mode 100644 index 00000000..37b2c7fb --- /dev/null +++ b/configs/sklearnex_incremental_example.json @@ -0,0 +1,65 @@ +{ + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "unlabeled dataset": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "labeled dataset": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 5000, + "n_features": [40, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 2} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex", + "num_batches": {"training": 2} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview", + "num_batches": {"training": 2} + } + ] + } + }, + "TEMPLATES": { + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, + "linear_regression": { + "SETS": ["common", "linear_regression", "labeled dataset"] + }, + "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..dd0ef1a5 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str): def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: # default estimator methods estimator_methods = { - "training": ["fit"], + 
"training": ["partial_fit", "fit"], "inference": ["predict", "predict_proba", "transform"], } for stage in estimator_methods.keys(): @@ -334,34 +334,35 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function(method_instance, data_args, batch_size): - n_batches = data_args[0].shape[0] // batch_size +def create_online_function(estimator_instance, method_instance, data_args, num_batches): if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): - for i in range(n_batches): - method_instance( - x[i * batch_size : (i + 1) * batch_size], - y[i * batch_size : (i + 1) * batch_size], - ) + for i in range(num_batches): + method_instance(x, y) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): - for i in range(n_batches): - method_instance( - x.iloc[i * batch_size : (i + 1) * batch_size], - y.iloc[i * batch_size : (i + 1) * batch_size], - ) + for i in range(num_batches): + method_instance(x, y) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): - for i in range(n_batches): - method_instance(x[i * batch_size : (i + 1) * batch_size]) + for i in range(num_batches): + method_instance(x) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x): - for i in range(n_batches): - method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + for i in range(num_batches): + method_instance(x) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function @@ -417,9 +418,17 @@ def measure_sklearn_estimator( batch_size = get_bench_case_value( bench_case, f"algorithm:batch_size:{stage}" ) - if batch_size is not None: + + if method 
== "partial_fit": + num_batches = get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}", 5 + ) + method_instance = create_online_function( - method_instance, data_args, batch_size + estimator_instance, + method_instance, + data_args, + num_batches ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": @@ -536,9 +545,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): for stage in estimator_methods.keys(): data_descs[stage].update( { - "batch_size": get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) + key: val + for key, val in { + "batch_size": get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ), + "num_batches": get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ) + }.items() + if val is not None } ) if "n_classes" in data_description: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 28fa2bb0..689396f1 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -16,7 +16,7 @@ import argparse import json -from typing import Dict, List +from typing import Dict, Hashable, List import openpyxl as xl import pandas as pd @@ -94,6 +94,7 @@ "order", "n_classes", "n_clusters", + "num_batches", "batch_size", ] @@ -239,6 +240,7 @@ def get_result_tables_as_df( bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) + bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x) if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) @@ -248,7 +250,7 @@ def get_result_tables_as_df( bench_cases.drop(columns=[column], inplace=True) diffby_columns.remove(column) - return split_df_by_columns(bench_cases, splitby_columns) + return split_df_by_columns(bench_cases, splitby_columns, False) def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: @@ -258,7 +260,10 @@ def 
get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: # only relative improvements are included in summary currently if len(column) > 1 and column[1] == f"{metric_name} relative improvement": metric_columns.append(column) - summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + if metric_columns: + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + else: + summary = pd.DataFrame() summary.index = pd.Index([df_name]) return summary diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index a37769ce..d8c1a64e 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,6 +45,11 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json + displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml b/test-configuration-win.yml index a1eddaeb..f3ac1595 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,6 +43,10 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + call activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: