From e332b3fd021310dfff767cd569850bf539985282 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Wed, 19 Feb 2025 14:34:52 +0100 Subject: [PATCH 1/4] Create function to obtain the khiops_env script file name --- khiops/core/internals/runner.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index a9c426d6..c66dccf4 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -120,11 +120,17 @@ def _extract_path_from_uri(uri): return path +def _get_khiops_env_script_file_name(): + """Returns the file name for the khiops_env script in a portable way""" + khiops_env_file_name = "khiops_env" + if platform.system() == "Windows": + khiops_env_file_name += ".cmd" + return khiops_env_file_name + + def _khiops_env_file_exists(env_dir): """Check ``khiops_env`` exists relative to the specified environment dir""" - khiops_env_path = os.path.join(env_dir, "khiops_env") - if platform.system() == "Windows": - khiops_env_path += ".cmd" + khiops_env_path = os.path.join(env_dir, _get_khiops_env_script_file_name()) return os.path.exists(khiops_env_path) and os.path.isfile(khiops_env_path) From 3c970ff70706ad9146a5a2d01d94e516b9be5e69 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Wed, 19 Feb 2025 14:48:34 +0100 Subject: [PATCH 2/4] Render dynamic the Khiops execution command --- doc/samples/samples.rst | 473 ++++++++---------------- doc/samples/samples_sklearn.rst | 171 ++++++++- khiops/core/internals/runner.py | 263 +++++++------- khiops/samples/samples.ipynb | 518 +++++++++------------------ khiops/samples/samples_sklearn.ipynb | 197 +++++++++- tests/test_khiops_integrations.py | 21 +- 6 files changed, 804 insertions(+), 839 deletions(-) diff --git a/doc/samples/samples.rst b/doc/samples/samples.rst index 2cc8a57a..db67f39e 100644 --- a/doc/samples/samples.rst +++ b/doc/samples/samples.rst @@ -66,12 +66,12 @@ Samples # Set the file paths data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - output_dir = os.path.join("kh_samples", "detect_data_table_format") - transformed_data_table_path = os.path.join(output_dir, "AdultWithAnotherFormat.txt") + results_dir = os.path.join("kh_samples", "detect_data_table_format") + transformed_data_table_path = os.path.join(results_dir, "AdultWithAnotherFormat.txt") # Create the output directory - if not os.path.isdir(output_dir): - os.mkdir(output_dir) + if not os.path.isdir(results_dir): + os.mkdir(results_dir) # Detect the format of the table format_spec = kh.detect_data_table_format(data_table_path) @@ -128,10 +128,10 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - output_dir = os.path.join("kh_samples", "export_dictionary_file") - output_dictionary_file_path = os.path.join(output_dir, "ModifiedAdult.kdic") - output_dictionary_json_path = os.path.join(output_dir, "ModifiedAdult.kdicj") - alt_output_dictionary_json_path = os.path.join(output_dir, "AltModifiedAdult.kdicj") + results_dir = os.path.join("kh_samples", "export_dictionary_file") + output_dictionary_file_path = os.path.join(results_dir, "ModifiedAdult.kdic") + output_dictionary_json_path = os.path.join(results_dir, "ModifiedAdult.kdicj") + alt_output_dictionary_json_path = os.path.join(results_dir, 
"AltModifiedAdult.kdicj") # Load the dictionary domain from initial dictionary file # Then obtain the "Adult" dictionary within @@ -147,10 +147,10 @@ Samples # Create output directory if necessary if not os.path.exists("kh_samples"): os.mkdir("kh_samples") - os.mkdir(output_dir) + os.mkdir(results_dir) else: - if not os.path.exists(output_dir): - os.mkdir(output_dir) + if not os.path.exists(results_dir): + os.mkdir(results_dir) # Export to kdic domain.export_khiops_dictionary_file(output_dictionary_file_path) @@ -171,9 +171,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - analysis_report_file_path = os.path.join( - "kh_samples", "train_predictor", "AnalysisReport.khj" - ) + results_dir = os.path.join("kh_samples", "train_predictor") # Train the predictor kh.train_predictor( @@ -181,7 +179,7 @@ Samples "Adult", data_table_path, "class", - analysis_report_file_path, + results_dir, max_trees=0, ) .. autofunction:: train_predictor_file_paths @@ -194,17 +192,15 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join( - "kh_samples", "train_predictor_file_paths", "AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "train_predictor_file_paths") # Train the predictor - _, modeling_dictionary_file_path = kh.train_predictor( + report_file_path, modeling_dictionary_file_path = kh.train_predictor( dictionary_file_path, "Adult", data_table_path, "class", - report_file_path, + results_dir, max_trees=0, ) print("Reports file available at " + report_file_path) @@ -222,10 +218,9 @@ Samples # Set the file paths with a nonexistent dictionary file dictionary_file_path = "NONEXISTENT_DICTIONARY_FILE.kdic" data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "train_predictor_error_handling") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - log_file_path = os.path.join(output_dir, "khiops.log") - scenario_path = os.path.join(output_dir, "scenario._kh") + results_dir = os.path.join("kh_samples", "train_predictor_error_handling") + log_file_path = os.path.join(results_dir, "khiops.log") + scenario_path = os.path.join(results_dir, "scenario._kh") # Train the predictor and handle the error try: @@ -234,7 +229,7 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, trace=True, log_file_path=log_file_path, output_scenario_path=scenario_path, @@ -266,9 +261,7 @@ Samples dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") - report_file_path = os.path.join( - "kh_samples", "train_predictor_mt", "AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "train_predictor_mt") # Train the predictor. Besides the mandatory parameters, we specify: # - A python dictionary linking data paths to file paths for non-root tables @@ -279,8 +272,8 @@ Samples "Accident", accidents_table_path, "Gravity", - report_file_path, - additional_data_tables={"Vehicles": vehicles_table_path}, + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, max_trees=0, ) .. 
autofunction:: train_predictor_mt_with_specific_rules @@ -295,11 +288,7 @@ Samples dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") - report_file_path = os.path.join( - "kh_samples", - "train_predictor_mt_with_specific_rules", - "AnalysisResults.khj", - ) + results_dir = os.path.join("kh_samples", "train_predictor_mt_with_specific_rules") # Train the predictor. Besides the mandatory parameters, it is specified: # - A python dictionary linking data paths to file paths for non-root tables @@ -311,8 +300,8 @@ Samples "Accident", accidents_table_path, "Gravity", - report_file_path, - additional_data_tables={"Vehicles": vehicles_table_path}, + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, max_constructed_variables=1000, construction_rules=["TableMode", "TableSelection"], max_trees=0, @@ -331,9 +320,7 @@ Samples vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") users_table_path = os.path.join(accidents_dir, "Users.txt") places_table_path = os.path.join(accidents_dir, "Places.txt") - report_file_path = os.path.join( - "kh_samples", "train_predictor_mt_snowflake", "AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "train_predictor_mt_snowflake") # Train the predictor. Besides the mandatory parameters, we specify: # - A python dictionary linking data paths to file paths for non-root tables @@ -344,11 +331,11 @@ Samples "Accident", accidents_table_path, "Gravity", - report_file_path, + results_dir, additional_data_tables={ - "Vehicles": vehicles_table_path, - "Vehicles/Users": users_table_path, - "Place": places_table_path, + "Accident`Vehicles": vehicles_table_path, + "Accident`Vehicles`Users": users_table_path, + "Accident`Place": places_table_path, }, max_trees=0, ) @@ -362,11 +349,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join( - "kh_samples", - "train_predictor_with_train_percentage", - "P90_AnalysisResults.khj", - ) + results_dir = os.path.join("kh_samples", "train_predictor_with_train_percentage") # Train the predictor. Besides the mandatory parameters, it is specified: # - A 90% sampling rate for the training dataset @@ -377,10 +360,11 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, sample_percentage=90, use_complement_as_test=True, max_trees=0, + results_prefix="P90_", ) .. autofunction:: train_predictor_with_trees .. code-block:: python @@ -392,9 +376,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.txt") - report_file_path = os.path.join( - "kh_samples", "train_predictor_with_trees", "P80_AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "train_predictor_with_trees") # Train the predictor with at most 15 trees (default 10) kh.train_predictor( @@ -402,9 +384,10 @@ Samples "Letter", data_table_path, "lettr", - report_file_path, + results_dir, sample_percentage=80, use_complement_as_test=True, + results_prefix="P80_", max_trees=15, ) .. 
autofunction:: train_predictor_with_pairs @@ -417,9 +400,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join( - "kh_samples", "train_predictor_with_pairs", "AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "train_predictor_with_pairs") # Train the predictor with at most 10 pairs as follows: # - Include pairs age-race and capital_gain-capital_loss @@ -429,7 +410,7 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, use_complement_as_test=True, max_trees=0, max_pairs=10, @@ -449,10 +430,9 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_script_path = os.path.join(output_dir, "output_scenario._kh") - log_path = os.path.join(output_dir, "log.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters") + output_script_path = os.path.join(results_dir, "output_scenario._kh") + log_path = os.path.join(results_dir, "log.txt") # Train the predictor. Besides the mandatory parameters, we specify: # - The value "more" as main target value @@ -465,7 +445,7 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, main_target_value="more", output_scenario_path=output_script_path, log_file_path=log_path, @@ -482,9 +462,8 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - output_dir = os.path.join("kh_samples", "train_predictor_detect_format") - transformed_data_table_path = os.path.join(output_dir, "TransformedIris.txt") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") + results_dir = os.path.join("kh_samples", "train_predictor_detect_format") + transformed_data_table_path = os.path.join(results_dir, "TransformedIris.txt") # Transform the database format from header_line=True and field_separator=TAB # to header_line=False and field_separator="," @@ -505,7 +484,7 @@ Samples "Iris", transformed_data_table_path, "Class", - report_file_path, + results_dir, header_line=True, field_separator="", ) @@ -522,7 +501,7 @@ Samples "Iris", transformed_data_table_path, "Class", - report_file_path, + results_dir, ) .. autofunction:: train_predictor_with_cross_validation .. 
code-block:: python @@ -535,12 +514,12 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation") - fold_dictionary_file_path = os.path.join(output_dir, "AdultWithFolding.kdic") + results_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation") + fold_dictionary_file_path = os.path.join(results_dir, "AdultWithFolding.kdic") # Create the output directory - if not os.path.isdir(output_dir): - os.mkdir(output_dir) + if not os.path.isdir(results_dir): + os.mkdir(results_dir) # Load the learning dictionary object domain = kh.read_dictionary_file(dictionary_file_path) @@ -577,39 +556,35 @@ Samples train_aucs = [] test_aucs = [] for fold_index in range(1, fold_number + 1): - analysis_report_file_path = os.path.join( - output_dir, "Fold" + str(fold_index) + "AnalysisResults.khj" - ) # Train a model from the sub-dataset where IsInTrainDataset is 1 - _, modeling_dictionary_file_path = kh.train_predictor( + train_reports_path, modeling_dictionary_file_path = kh.train_predictor( domain, "Adult", data_table_path, "class", - analysis_report_file_path, + results_dir, sample_percentage=100, selection_variable="IsInTrainDataset" + str(fold_index), selection_value=1, max_trees=0, + results_prefix="Fold" + str(fold_index), ) - evaluation_report_file_path = os.path.join( - output_dir, "Fold" + str(fold_index) + "AdultEvaluationResults.khj" - ) # Evaluate the resulting model in the subsets where IsInTrainDataset is 0 test_evaluation_report_path = kh.evaluate_predictor( modeling_dictionary_file_path, - "SNB_Adult", + "Adult", data_table_path, - evaluation_report_file_path, + results_dir, sample_percentage=100, selection_variable="IsInTrainDataset" + str(fold_index), selection_value=0, + results_prefix="Fold" + str(fold_index), ) # Obtain the train AUC from the train report and the test AUC from the # evaluation report and print them - train_results = kh.read_analysis_results_file(analysis_report_file_path) + train_results = kh.read_analysis_results_file(train_reports_path) test_evaluation_results = kh.read_analysis_results_file(test_evaluation_report_path) train_auc = train_results.train_evaluation_report.get_snb_performance().auc test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc @@ -631,32 +606,6 @@ Samples print("final auc") print("train auc: " + str(mean_train_auc) + " +- " + str(sd_train_auc)) print("test auc: " + str(mean_test_auc) + " +- " + str(sd_test_auc)) -.. autofunction:: interpret_predictor -.. code-block:: python - - # Imports - import os - from khiops import core as kh - - dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "interpret_predictor") - analysis_report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - interpretor_file_path = os.path.join(output_dir, "InterpretationModel.kdic") - - # Build prediction model - _, predictor_file_path = kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - analysis_report_file_path, - ) - - # Build interpretation model - kh.interpret_predictor(predictor_file_path, "SNB_Adult", interpretor_file_path) - - print(f"The interpretation model is '{interpretor_file_path}'") .. autofunction:: multiple_train_predictor .. 
code-block:: python @@ -683,8 +632,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "multiple_train_predictor") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") + results_dir = os.path.join("kh_samples", "multiple_train_predictor") # Read the dictionary file to obtain an instance of class Dictionary dictionary_domain = kh.read_dictionary_file(dictionary_file_path) @@ -692,20 +640,20 @@ Samples # Train a SNB model using all the variables print("\t#vars\ttrain auc\ttest auc") - kh.train_predictor( + json_result_file_path, _ = kh.train_predictor( dictionary_file_path, "Adult", data_table_path, "class", - report_file_path, + results_dir, sample_percentage=70, use_complement_as_test=True, max_trees=0, ) - display_test_results(report_file_path) + display_test_results(json_result_file_path) # Read results to obtain the variables sorted by decreasing Level - analysis_results = kh.read_analysis_results_file(report_file_path) + analysis_results = kh.read_analysis_results_file(json_result_file_path) preparation_results = analysis_results.preparation_report # Train a sequence of models with a decreasing number of variables @@ -719,22 +667,21 @@ Samples dictionary.get_variable(variable.name).used = False # Train the model with this dictionary domain object - report_file_path = os.path.join( - output_dir, f"V{variable_number - 1 - i}_AnalysisResults.khj" - ) - kh.train_predictor( + prefix = f"V{variable_number - 1 - i}_" + json_result_file_path, _ = kh.train_predictor( dictionary_domain, "Adult", data_table_path, "class", - report_file_path, + results_dir, sample_percentage=70, use_complement_as_test=True, + results_prefix=prefix, max_trees=0, ) # Show a preview of the results - display_test_results(report_file_path) + display_test_results(json_result_file_path) .. autofunction:: evaluate_predictor .. code-block:: python @@ -745,29 +692,24 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "evaluate_predictor") - analysis_report_file_path = os.path.join(output_dir, "AnalysisResults.khj") + results_dir = os.path.join("kh_samples", "evaluate_predictor") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") # Train the predictor - _, model_dictionary_file_path = kh.train_predictor( + kh.train_predictor( dictionary_file_path, "Adult", data_table_path, "class", - analysis_report_file_path, + results_dir, max_trees=0, ) - evaluation_report_file_path = os.path.join(output_dir, "AdultEvaluationResults.khj") - # Evaluate the predictor - kh.evaluate_predictor( - model_dictionary_file_path, - "SNB_Adult", - data_table_path, - evaluation_report_file_path, + report_file_path = kh.evaluate_predictor( + model_dictionary_file_path, "Adult", data_table_path, results_dir ) - print("Evaluation report available at " + evaluation_report_file_path) + print("Evaluation report available at " + report_file_path) .. autofunction:: access_predictor_evaluation_report .. 
code-block:: python @@ -778,9 +720,8 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join( - "kh_samples", "access_predictor_evaluation_report", "AdultAnalysisReport.khj" - ) + results_dir = os.path.join("kh_samples", "access_predictor_evaluation_report") + evaluation_report_path = os.path.join(results_dir, "AllReports.khj") # Train the SNB predictor and some univariate predictors # Note: Evaluation in test is 30% by default @@ -789,12 +730,13 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, max_trees=0, + univariate_predictor_number=4, ) # Obtain the evaluation results - results = kh.read_analysis_results_file(report_file_path) + results = kh.read_analysis_results_file(evaluation_report_path) evaluation_report = results.test_evaluation_report snb_performance = evaluation_report.get_snb_performance() @@ -834,9 +776,9 @@ Samples + str(random_lift_curve.values[i]) ) - # Print metrics for an SNB predictor + # Print univariate metrics for an univariate predictor predictor_performance = evaluation_report.get_predictor_performance( - "Selective Naive Bayes" + "Univariate relationship" ) print("\n\nperformance metrics for " + predictor_performance.name) for metric_name in predictor_performance.get_metric_names(): @@ -851,12 +793,10 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join("kh_samples", "train_recoder", "AnalysisResults.khj") + results_dir = os.path.join("kh_samples", "train_recoder") # Train the recoder model - kh.train_recoder( - dictionary_file_path, "Adult", data_table_path, "class", report_file_path - ) + kh.train_recoder(dictionary_file_path, "Adult", data_table_path, "class", results_dir) .. autofunction:: train_recoder_with_multiple_parameters .. code-block:: python @@ -867,11 +807,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join( - "kh_samples", - "train_recoder_with_multiple_parameters", - "AnalysisResults.khj", - ) + results_dir = os.path.join("kh_samples", "train_recoder_with_multiple_parameters") # Train the recoder model kh.train_recoder( @@ -879,7 +815,7 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, max_pairs=10, categorical_recoding_method="part label", numerical_recoding_method="part label", @@ -896,9 +832,7 @@ Samples dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") - report_file_path = os.path.join( - "kh_samples", "train_recoder_mt_flatten", "AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "train_recoder_mt_flatten") # Train the recoder. 
Besides the mandatory parameters, it is specified: # - A python dictionary linking data paths to file paths for non-root tables @@ -910,8 +844,8 @@ Samples "Accident", accidents_table_path, "Gravity", - report_file_path, - additional_data_tables={"Vehicles": vehicles_table_path}, + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, max_constructed_variables=1000, informative_variables_only=False, categorical_recoding_method="none", @@ -929,17 +863,17 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "deploy_model") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt") + results_dir = os.path.join("kh_samples", "deploy_model") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") # Train the predictor - _, model_dictionary_file_path = kh.train_predictor( + kh.train_predictor( dictionary_file_path, "Adult", data_table_path, "class", - report_file_path, + results_dir, max_trees=0, ) @@ -960,18 +894,18 @@ Samples dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") - output_dir = os.path.join("kh_samples", "deploy_model_mt") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_data_table_path = os.path.join(output_dir, "TransferredAccidents.txt") + results_dir = os.path.join("kh_samples", "deploy_model_mt") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt") # Train the predictor (see train_predictor_mt for details) - _, model_dictionary_file_path = kh.train_predictor( + kh.train_predictor( dictionary_file_path, "Accident", accidents_table_path, "Gravity", - report_file_path, - additional_data_tables={"Vehicles": vehicles_table_path}, + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, max_trees=0, ) @@ -983,56 +917,7 @@ Samples "SNB_Accident", accidents_table_path, output_data_table_path, - additional_data_tables={"Vehicles": vehicles_table_path}, - ) -.. autofunction:: deploy_model_mt_with_interpretation -.. 
code-block:: python - - # Imports - import os - from khiops import core as kh - - # Set the file paths - accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") - output_dir = os.path.join("kh_samples", "deploy_model_mt") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - interpretor_file_path = os.path.join(output_dir, "InterpretationModel.kdic") - output_data_table_path = os.path.join(output_dir, "InterpretedAccidents.txt") - - # Train the predictor (see train_predictor_mt for details) - # Add max_evaluated_variables so that an interpretation model can be built - # (see https://github.com/KhiopsML/khiops/issues/577) - _, model_dictionary_file_path = kh.train_predictor( - dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", - report_file_path, - additional_data_tables={"Vehicles": vehicles_table_path}, - max_trees=0, - max_evaluated_variables=10, - ) - - # Interpret the predictor - kh.interpret_predictor( - model_dictionary_file_path, - "SNB_Accident", - interpretor_file_path, - reinforcement_target_value="NonLethal", - ) - - # Deploy the interpretation model on the database - # Besides the mandatory parameters, it is specified: - # - A python dictionary linking data paths to file paths for non-root tables - kh.deploy_model( - interpretor_file_path, - "Interpretation_SNB_Accident", - accidents_table_path, - output_data_table_path, - additional_data_tables={"Vehicles": vehicles_table_path}, + additional_data_tables={"SNB_Accident`Vehicles": vehicles_table_path}, ) .. autofunction:: deploy_model_mt_snowflake .. code-block:: python @@ -1048,24 +933,24 @@ Samples vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") users_table_path = os.path.join(accidents_dir, "Users.txt") places_table_path = os.path.join(accidents_dir, "Places.txt") - output_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_data_table_path = os.path.join(output_dir, "TransferredAccidents.txt") + results_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt") # Train the predictor. Besides the mandatory parameters, we specify: # - A python dictionary linking data paths to file paths for non-root tables # - To not construct any decision tree # The default number of automatic features is 100 - _, model_dictionary_file_path = kh.train_predictor( + kh.train_predictor( dictionary_file_path, "Accident", accidents_table_path, "Gravity", - report_file_path, + results_dir, additional_data_tables={ - "Vehicles": vehicles_table_path, - "Vehicles/Users": users_table_path, - "Place": places_table_path, + "Accident`Vehicles": vehicles_table_path, + "Accident`Vehicles`Users": users_table_path, + "Accident`Place": places_table_path, }, max_trees=0, ) @@ -1079,9 +964,9 @@ Samples accidents_table_path, output_data_table_path, additional_data_tables={ - "Vehicles": vehicles_table_path, - "Vehicles/Users": users_table_path, - "Place": places_table_path, + "SNB_Accident`Vehicles": vehicles_table_path, + "SNB_Accident`Vehicles`Users": users_table_path, + "SNB_Accident`Place": places_table_path, }, ) .. 
autofunction:: deploy_model_expert @@ -1094,17 +979,17 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "deploy_model_expert") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt") + results_dir = os.path.join("kh_samples", "deploy_model_expert") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") # Train the predictor - _, model_dictionary_file_path = kh.train_predictor( + kh.train_predictor( dictionary_file_path, "Adult", data_table_path, "class", - report_file_path, + results_dir, max_trees=0, ) @@ -1135,9 +1020,8 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt") + results_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") # Train the classifier for the target "class" _, modeling_dictionary_file_path = kh.train_predictor( @@ -1145,9 +1029,10 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, max_trees=0, ) + # Obtain the scores of the SNB on the test dataset to calculate the PR curve kh.deploy_predictor_for_metrics( modeling_dictionary_file_path, @@ -1202,9 +1087,8 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - output_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") - output_data_table_path = os.path.join(output_dir, "TrueAndPredictedAges.txt") + results_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics") + output_data_table_path = os.path.join(results_dir, "TrueAndPredictedAges.txt") # Train the regressor for the target "age" (with 20% train to be quick) _, modeling_dictionary_file_path = kh.train_predictor( @@ -1212,7 +1096,7 @@ Samples "Adult", data_table_path, "age", - report_file_path, + results_dir, sample_percentage=20, max_trees=0, ) @@ -1335,51 +1219,18 @@ Samples splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") - coclustering_report_path = os.path.join( - "kh_samples", "train_coclustering", "CoclusteringResults.khcj" - ) + results_dir = os.path.join("kh_samples", "train_coclustering") # Train a coclustering model for variables "SampleId" and "Char" - kh.train_coclustering( + coclustering_report_path = kh.train_coclustering( dictionary_file_path, "SpliceJunctionDNA", data_table_path, ["SampleId", "Char"], - coclustering_report_path, + results_dir, ) print(f"Coclustering report file available at {coclustering_report_path}") - # If you have Khiops Co-Visualization installed you may open the report as follows - # kh.visualize_report(coclustering_report_path) -.. 
autofunction:: train_instance_variable_coclustering -.. code-block:: python - - # Imports - import os - from khiops import core as kh - - # Set the file paths - iris_dir = os.path.join(kh.get_samples_dir(), "Iris") - dictionary_file_path = os.path.join(iris_dir, "Iris.kdic") - data_table_path = os.path.join(iris_dir, "Iris.txt") - coclustering_report_path = os.path.join( - "kh_samples", - "train_instance_variable_coclustering", - "CoclusteringResults.khcj", - ) - - # Train a coclustering model for variables "SampleId" and "Char" - kh.train_instance_variable_coclustering( - dictionary_file_path, - "Iris", - data_table_path, - coclustering_report_path, - ) - print( - "Instance-variable coclustering report file available " - f"at {coclustering_report_path}" - ) - # If you have Khiops Co-Visualization installed you may open the report as follows # kh.visualize_report(coclustering_report_path) .. autofunction:: simplify_coclustering @@ -1393,11 +1244,9 @@ Samples splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") - output_dir = os.path.join("kh_samples", "simplify_coclustering") - coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj") - simplified_coclustering_file_path = os.path.join( - output_dir, "simplified_coclustering.khcj" - ) + results_dir = os.path.join("kh_samples", "simplify_coclustering") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + simplified_coclustering_file_name = "simplified_coclustering.khc" # Train coclustering model for variables "SampleId" and "Char" kh.train_coclustering( @@ -1405,7 +1254,7 @@ Samples "SpliceJunctionDNA", data_table_path, ["SampleId", "Char"], - coclustering_file_path, + results_dir, ) # Simplify the trained coclustering with the constraints @@ -1413,7 +1262,8 @@ Samples # - maximum total parts number: 4 kh.simplify_coclustering( coclustering_file_path, - simplified_coclustering_file_path, + simplified_coclustering_file_name, + results_dir, max_preserved_information=80, max_total_parts=4, ) @@ -1424,9 +1274,9 @@ Samples splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") - output_dir = os.path.join("kh_samples", "extract_clusters") - coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj") - clusters_file_path = os.path.join(output_dir, "extracted_clusters.txt") + results_dir = os.path.join("kh_samples", "extract_clusters") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + clusters_file_path = os.path.join(results_dir, "extracted_clusters.txt") # Train a coclustering model for variables "SampleId" and "Char" kh.train_coclustering( @@ -1434,7 +1284,7 @@ Samples "SpliceJunctionDNA", data_table_path, ["SampleId", "Char"], - coclustering_file_path, + results_dir, ) # Extract clusters @@ -1450,10 +1300,8 @@ Samples splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") - output_dir = os.path.join("kh_samples", "deploy_coclustering") - coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj") - coclustering_dictionary_file_path = os.path.join(output_dir, "Coclustering.kdic") - output_data_table_path = 
os.path.join(output_dir, "DeployedSpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "deploy_coclustering") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") # Train a coclustering model for variables "SampleId" and "Char" kh.train_coclustering( @@ -1461,7 +1309,7 @@ Samples "SpliceJunctionDNA", data_table_path, ["SampleId", "Char"], - coclustering_file_path, + results_dir, ) # Deploy "Char" clusters in the training database @@ -1472,8 +1320,7 @@ Samples coclustering_file_path, ["SampleId"], "Char", - coclustering_dictionary_file_path, - output_data_table_path, + results_dir, header_line=True, ) .. autofunction:: deploy_coclustering_expert @@ -1488,8 +1335,8 @@ Samples dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") data_table_path = os.path.join(splice_dir, "SpliceJunction.txt") secondary_data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") - output_dir = os.path.join("kh_samples", "deploy_coclustering_expert") - coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj") + results_dir = os.path.join("kh_samples", "deploy_coclustering_expert") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") # Train a coclustering model for variables "SampleId" and "Char" print("train coclustering on SpliceJunctionDNA") @@ -1498,38 +1345,39 @@ Samples "SpliceJunctionDNA", secondary_data_table_path, ["SampleId", "Char"], - coclustering_file_path, + results_dir, ) print("prepare_coclustering_deployment") # The input dictionary is extended with new coclustering based variables - augmented_dictionary_file_path = os.path.join(output_dir, "Coclustering.kdic") kh.prepare_coclustering_deployment( dictionary_file_path, "SpliceJunction", coclustering_file_path, "DNA", "SampleId", - augmented_dictionary_file_path, + results_dir, ) + augmented_dictionary_file_path = os.path.join(results_dir, "Coclustering.kdic") print("prepare_coclustering_deployment with at most two clusters") # Extend the already extended dictionary with the new variables from a simplified CC - reaugmented_dictionary_file_path = os.path.join( - output_dir, "ReaugmentedCoclustering.kdic" - ) kh.prepare_coclustering_deployment( augmented_dictionary_file_path, "SpliceJunction", coclustering_file_path, "DNA", "SampleId", - reaugmented_dictionary_file_path, + results_dir, + results_prefix="Reaugmented", variables_prefix="C2_", max_part_numbers={"SampleId": 2}, ) - output_data_table_path = os.path.join(output_dir, "TransferredSpliceJunction.txt") + reaugmented_dictionary_file_path = os.path.join( + results_dir, "ReaugmentedCoclustering.kdic" + ) + output_data_table_path = os.path.join(results_dir, "TransferredSpliceJunction.txt") # Deploy the coclustering with the extended dictionary print("deploy_model with the new coclustering based variables") @@ -1538,11 +1386,11 @@ Samples "SpliceJunction", data_table_path, output_data_table_path, - additional_data_tables={"DNA": secondary_data_table_path}, + additional_data_tables={"SpliceJunction`DNA": secondary_data_table_path}, ) deployed_dictionary_file_path = os.path.join( - output_dir, "Transferred_Coclustering.kdic" + results_dir, "Transferred_Coclustering.kdic" ) print("build_deployed_dictionary to get the new dictionary") kh.build_deployed_dictionary( @@ -1560,9 +1408,7 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - report_file_path = os.path.join( - 
"kh_samples", "scenario_prologue", "AnalysisResults.khj" - ) + results_dir = os.path.join("kh_samples", "scenario_prologue") # Set the maximum memory "by hand" with an scenario prologue scenario_prologue = """ @@ -1576,7 +1422,7 @@ Samples "Adult", data_table_path, "class", - report_file_path, + results_dir, max_trees=0, scenario_prologue=scenario_prologue, ) @@ -1590,9 +1436,8 @@ Samples # Set the file paths dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - output_dir = os.path.join("kh_samples", "build_deployed_dictionary") - deployed_dictionary_file_path = os.path.join(output_dir, "SNB_Iris_deployed.kdic") - report_file_path = os.path.join(output_dir, "AnalysisResults.khj") + results_dir = os.path.join("kh_samples", "build_deployed_dictionary") + deployed_dictionary_file_path = os.path.join(results_dir, "SNB_Iris_deployed.kdic") # Train the predictor _, modeling_dictionary_file_path = kh.train_predictor( @@ -1600,7 +1445,7 @@ Samples "Iris", data_table_path, "Class", - report_file_path, + results_dir, max_trees=0, ) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index e3fa6e25..f5ce17b8 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -155,7 +155,8 @@ Samples import os import pandas as pd from khiops import core as kh - from khiops.sklearn import KhiopsClassifier, train_test_split_dataset + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics # Load the dataset into pandas dataframes @@ -210,7 +211,8 @@ Samples import os import pandas as pd from khiops import core as kh - from khiops.sklearn import KhiopsClassifier, train_test_split_dataset + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics # Load the dataset tables into dataframes @@ -218,9 +220,7 @@ Samples accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t") users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t") vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - places_df = pd.read_csv( - os.path.join(accidents_data_dir, "Places.txt"), sep="\t", low_memory=False - ) + places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t") # Build the multi-table dataset spec (drop the target column "Gravity") X = { @@ -402,7 +402,7 @@ Samples os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the split root dataframe + # Split the secondary dataframe with the keys of the splitted root dataframe X_train_ids = X_train_main["AccidentId"].to_frame() X_test_ids = X_test_main["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") @@ -590,9 +590,7 @@ Samples accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t") users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t") vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - places_df = pd.read_csv( - os.path.join(accidents_data_dir, "Places.txt"), sep="\t", low_memory=False - ) + places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t") # Build the multi-table dataset spec (drop the target column "Gravity") X = { @@ -727,7 +725,7 @@ Samples 
keep_initial_variables=True, transform_type_categorical="part_id", transform_type_numerical="part_id", - transform_type_pairs="part_id", + transform_pairs="part_id", ) khe.fit(X, y) @@ -803,3 +801,156 @@ Samples print("Predicted clusters (only three at most)") print(X_clusters) print("---") +.. autofunction:: khiops_classifier_multitable_list +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Load the root table of the dataset into a pandas dataframe + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) + + # Load the secondary table of the dataset into a pandas dataframe + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") + + # Split the secondary dataframe with the keys of the splitted root dataframe + X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() + X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") + X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") + + # Create the classifier specifying the key column name + khc = KhiopsClassifier(key="AccidentId") + + # Train the classifier + khc.fit([X_train, X_train_secondary], y_train) + + # Predict the class on the test dataset + y_test_pred = khc.predict([X_test, X_test_secondary]) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_multitable_star_file +.. 
code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + if not os.path.exists("kh_samples"): + os.mkdir("kh_samples") + os.mkdir(results_dir) + else: + if not os.path.exists(results_dir): + os.mkdir(results_dir) + + # Load the root table of the dataset into a pandas dataframe + accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Accidents.txt"), + sep="\t", + ) + + # Split the root dataframe into train and test + X_train_main, X_test_main = train_test_split( + accidents_df, test_size=0.3, random_state=1 + ) + + # Load the secondary table of the dataset into a pandas dataframe + vehicles_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + ) + + # Split the secondary dataframe with the keys of the splitted root dataframe + X_train_ids = X_train_main["AccidentId"].to_frame() + X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") + X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") + + # Write the train and test dataset sets to disk + # For the test file we remove the target column from the main table + X_train_main_path = os.path.join(results_dir, "X_train_main.txt") + X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) + X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt") + X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) + X_test_main_path = os.path.join(results_dir, "X_test_main.txt") + y_test = X_test_main.sort_values("AccidentId")["Gravity"] + X_test_main.drop(columns="Gravity").to_csv( + X_test_main_path, sep="\t", header=True, index=False + ) + X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt") + X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) + + # Define the dictionary of train + X_train = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_train_main_path, "AccidentId"), + "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), + }, + "format": ("\t", True), + } + X_test = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_test_main_path, "AccidentId"), + "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), + }, + "format": ("\t", True), + } + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = 
metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"])
+    print(f"Test accuracy = {test_accuracy}")
+    print(f"Test auc = {test_auc}")
diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py
index c66dccf4..6087dc46 100644
--- a/khiops/core/internals/runner.py
+++ b/khiops/core/internals/runner.py
@@ -157,6 +157,41 @@ def _infer_env_bin_dir_for_conda_based_installations():
     return env_bin_dir
 
 
+def _read_khiops_env_output(khiops_env_path):
+    """Reads the output of the khiops_env script and puts it in a dictionary"""
+    # Initialize the output
+    khiops_env_output = {}
+
+    # Execute khiops_env --env
+    with subprocess.Popen(
+        [khiops_env_path, "--env"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        universal_newlines=True,
+    ) as khiops_env_process:
+        stdout, stderr = khiops_env_process.communicate()
+        if khiops_env_process.returncode != 0:
+            raise KhiopsEnvironmentError(
+                "Error initializing the environment for Khiops from the "
+                f"{khiops_env_path} script. Contents of stderr:\n{stderr}"
+            )
+
+    # Read the output lines and set the keys in the dictionary
+    for line in stdout.split("\n"):
+        tokens = line.rstrip().split(maxsplit=1)
+        if len(tokens) == 2:
+            var_name, var_value = tokens
+        elif len(tokens) == 1:
+            var_name = tokens[0]
+            var_value = ""
+        else:
+            continue
+
+        khiops_env_output[var_name] = var_value
+
+    return khiops_env_output
+
+
 def _check_conda_env_bin_dir(conda_env_bin_dir):
     """Check inferred Conda environment binary directory really is one
 
@@ -184,7 +219,47 @@ def _check_conda_env_bin_dir(conda_env_bin_dir):
     return is_conda_env_bin_dir
 
 
-def _infer_khiops_installation_method(trace=False):
+def _get_khiops_env_script_path():
+    # On Windows native installations, rely on the `KHIOPS_HOME` environment
+    # variable set by the Khiops Desktop Application installer
+    installation_method = _infer_khiops_installation_method()
+    if platform.system() == "Windows" and installation_method == "binary+pip":
+        # KHIOPS_HOME variable by default
+        if "KHIOPS_HOME" in os.environ:
+            khiops_env_script_path = os.path.join(
+                os.environ["KHIOPS_HOME"], "bin", _get_khiops_env_script_file_name()
+            )
+        # Raise error if KHIOPS_HOME is not set
+        else:
+            raise KhiopsEnvironmentError(
+                "No environment variable named 'KHIOPS_HOME' found. "
+                "Make sure you have installed Khiops >= 10.2.3. "
+                "Go to https://khiops.org for more information."
+            )
+
+    # In Conda-based environments, `khiops_env` might not be in the PATH,
+    # hence its path must be inferred
+    elif installation_method == "conda-based":
+        khiops_env_script_path = os.path.join(
+            _infer_env_bin_dir_for_conda_based_installations(),
+            _get_khiops_env_script_file_name(),
+        )
+
+    # On UNIX or Conda, khiops_env is always on the PATH in a proper installation
+    else:
+        khiops_env_script_path = shutil.which("khiops_env")
+        if khiops_env_script_path is None:
+            raise KhiopsEnvironmentError(
+                "The 'khiops_env' script was not found for the current "
+                f"'{installation_method}' installation method. Make sure "
+                "you have installed Khiops >= 10.2.3. "
+                "Go to https://khiops.org for more information."
+ ) + + return khiops_env_script_path + + +def _infer_khiops_installation_method(): """Return the Khiops installation method""" # We are in a conda environment if # - if the CONDA_PREFIX environment variable exists and, @@ -201,16 +276,12 @@ def _infer_khiops_installation_method(trace=False): # Otherwise, we choose between conda-based and local (default choice) else: env_bin_dir = _infer_env_bin_dir_for_conda_based_installations() - if trace: - print(f"Environment binary dir: '{env_bin_dir}'") if _check_conda_env_bin_dir(env_bin_dir) and _khiops_env_file_exists( env_bin_dir ): installation_method = "conda-based" else: installation_method = "binary+pip" - if trace: - print(f"Installation method: '{installation_method}'") assert installation_method in ("conda", "conda-based", "binary+pip") return installation_method @@ -799,9 +870,6 @@ class KhiopsLocalRunner(KhiopsRunner): def __init__(self): # Define specific attributes - self._mpi_command_args = None - self._khiops_path = None - self._khiops_coclustering_path = None self._khiops_version = None self._samples_dir = None self._samples_dir_checked = False @@ -809,86 +877,23 @@ def __init__(self): # Call parent constructor super().__init__() + # Initialize the khiops_env variables cache + self._khiops_env_cache = {} + # Initialize Khiops environment self._initialize_khiops_environment() def _initialize_khiops_environment(self): - # Check the `khiops_env` script - # On Windows native installations, rely on the `KHIOPS_HOME` environment - # variable set by the Khiops Desktop Application installer - installation_method = _infer_khiops_installation_method() - if platform.system() == "Windows" and installation_method == "binary+pip": - # KHIOPS_HOME variable by default - if "KHIOPS_HOME" in os.environ: - khiops_env_path = os.path.join( - os.environ["KHIOPS_HOME"], "bin", "khiops_env.cmd" - ) - # Raise error if KHIOPS_HOME is not set - else: - raise KhiopsEnvironmentError( - "No environment variable named 'KHIOPS_HOME' found. " - "Make sure you have installed Khiops >= 10.2.3. " - "Go to https://khiops.org for more information." - ) - - # In Conda-based environments, `khiops_env` might not be in the PATH, - # hence its path must be inferred - elif installation_method == "conda-based": - khiops_env_path = os.path.join( - _infer_env_bin_dir_for_conda_based_installations(), "khiops_env" - ) - if platform.system() == "Windows": - khiops_env_path += ".cmd" + # Obtain the khiops_env script path + khiops_env_path = _get_khiops_env_script_path() - # On UNIX or Conda, khiops_env is always in path for a proper installation - else: - khiops_env_path = shutil.which("khiops_env") - if khiops_env_path is None: - raise KhiopsEnvironmentError( - "The 'khiops_env' script not found for the current " - f"'{installation_method}' installation method. Make sure " - "you have installed khiops >= 10.2.3. " - "Go to https://khiops.org for more information." - ) - - with subprocess.Popen( - [khiops_env_path, "--env"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) as khiops_env_process: - stdout, stderr = khiops_env_process.communicate() - if khiops_env_process.returncode != 0: - raise KhiopsEnvironmentError( - "Error initializing the environment for Khiops from the " - f"{khiops_env_path} script. 
Contents of stderr:\n{stderr}" - ) - for line in stdout.split("\n"): - tokens = line.rstrip().split(maxsplit=1) - if len(tokens) == 2: - var_name, var_value = tokens - elif len(tokens) == 1: - var_name = tokens[0] - var_value = "" - else: - continue - # Set paths to Khiops binaries - if var_name == "KHIOPS_PATH": - self.khiops_path = var_value - os.environ["KHIOPS_PATH"] = var_value - elif var_name == "KHIOPS_COCLUSTERING_PATH": - self.khiops_coclustering_path = var_value - os.environ["KHIOPS_COCLUSTERING_PATH"] = var_value - # Set MPI command - elif var_name == "KHIOPS_MPI_COMMAND": - self._mpi_command_args = shlex.split(var_value) - os.environ["KHIOPS_MPI_COMMAND"] = var_value - # Propagate all the other environment variables to Khiops binaries - else: - os.environ[var_name] = var_value + # Read the contents from the execution of the khiops_env script + # Put them in the cache and in the environment + self._khiops_env_cache = _read_khiops_env_output(khiops_env_path) + os.environ.update(self._khiops_env_cache) - # Set KHIOPS_API_MODE to `true` - os.environ["KHIOPS_API_MODE"] = "true" + # Set KHIOPS_API_MODE to `true` + os.environ["KHIOPS_API_MODE"] = "true" # Check the tools exist and are executable self._check_tools() @@ -979,8 +984,8 @@ def _build_status_message(self): # Build the messages for install type and mpi install_type_msg = _infer_khiops_installation_method() - if self._mpi_command_args: - mpi_command_args_msg = " ".join(self._mpi_command_args) + if "KHIOPS_MPI_COMMAND" in os.environ and os.environ["KHIOPS_MPI_COMMAND"]: + mpi_command_args_msg = os.environ["KHIOPS_MPI_COMMAND"] else: mpi_command_args_msg = "" @@ -988,14 +993,19 @@ def _build_status_message(self): status_msg += "\n\n" status_msg += "khiops local installation settings\n" status_msg += f"version : {self.khiops_version}\n" - status_msg += f"Khiops path : {self.khiops_path}\n" - status_msg += f"Khiops CC path : {self.khiops_coclustering_path}\n" + status_msg += f"Khiops path : {os.environ['KHIOPS_PATH']}\n" + status_msg += ( + f"Khiops CC path : {os.environ['KHIOPS_COCLUSTERING_PATH']}\n" + ) + status_msg += f"khiops_env path : {_get_khiops_env_script_path()}\n" status_msg += f"install type : {install_type_msg}\n" status_msg += f"MPI command : {mpi_command_args_msg}\n" # Add output of khiops -s which gives the MODL_* binary status status_msg += "\n\n" - khiops_executable = os.path.join(os.path.dirname(self.khiops_path), "khiops") + khiops_executable = os.path.join( + os.path.dirname(os.environ["KHIOPS_PATH"]), "khiops" + ) status_msg += f"Khiops executable status (output of '{khiops_executable} -s')\n" stdout, stderr, return_code = self.raw_run("khiops", ["-s"], use_mpi=True) @@ -1017,68 +1027,26 @@ def _get_khiops_version(self): ) return self._khiops_version - @property - def mpi_command_args(self): - return self._mpi_command_args - - @property - def khiops_path(self): - """str: Path to the ``MODL*`` Khiops binary - - Set by the ``khiops_env`` script from the ``khiops-core`` package. 
- - """ - return self._khiops_path - - @khiops_path.setter - def khiops_path(self, modl_path): - # Check that the path is a directory and it exists - if not os.path.exists(modl_path): - raise KhiopsEnvironmentError(f"Inexistent Khiops path: '{modl_path}'") - if not os.path.isfile(modl_path): - raise KhiopsEnvironmentError( - f"Khiops file path is a directory: {modl_path}" - ) - - # Set the MODL path - self._khiops_path = modl_path - - @property - def khiops_coclustering_path(self): - """str: Path to the ``MODL_Coclustering`` Khiops Coclustering binary - - Set by the ``khiops_env`` script from the ``khiops-core`` package. - - """ - return self._khiops_coclustering_path - - @khiops_coclustering_path.setter - def khiops_coclustering_path(self, modl_coclustering_path): + def check_executable_path(self, exec_path): # Check that the path is a directory and it exists - if not os.path.exists(modl_coclustering_path): + if not os.path.exists(exec_path): raise KhiopsEnvironmentError( - f"Inexistent Khiops coclustering path: '{modl_coclustering_path}'" + f"Inexistent Khiops executable path: " f"{exec_path}" ) - if not os.path.isfile(modl_coclustering_path): + if not os.path.isfile(exec_path): raise KhiopsEnvironmentError( - "Khiops coclustering file path is a directory: " - f"{modl_coclustering_path}" + f"Khiops executable file path is a directory: {exec_path}" ) - # Set the MODL_Coclustering path - self._khiops_coclustering_path = modl_coclustering_path - def _tool_path(self, tool_name): """Full path of a Khiops tool binary""" - assert ( - self.khiops_path is not None and self.khiops_coclustering_path is not None - ) + assert "KHIOPS_PATH" in os.environ and "KHIOPS_COCLUSTERING_PATH" in os.environ tool_name = tool_name.lower() if tool_name not in ["khiops", "khiops_coclustering"]: raise ValueError(f"Invalid tool name: {tool_name}") modl_binaries = { - "khiops": self.khiops_path, - "khiops_coclustering": self.khiops_coclustering_path, + "khiops": os.environ["KHIOPS_PATH"], + "khiops_coclustering": os.environ["KHIOPS_COCLUSTERING_PATH"], } bin_path = modl_binaries[tool_name] @@ -1124,11 +1092,14 @@ def raw_run(self, tool_name, command_line_args=None, use_mpi=True, trace=False): type_error_message("command_line_args", command_line_args, list) ) + # Refresh the environment (only if needed) + self._refresh_khiops_environment() + # Build command line arguments # Nota: Khiops Coclustering is executed without MPI khiops_process_args = [] if tool_name == "khiops" and use_mpi: - khiops_process_args += self._mpi_command_args + khiops_process_args += shlex.split(os.environ["KHIOPS_MPI_COMMAND"]) khiops_process_args += [self._tool_path(tool_name)] if command_line_args: khiops_process_args += command_line_args @@ -1160,6 +1131,22 @@ def raw_run(self, tool_name, command_line_args=None, use_mpi=True, trace=False): return stdout, stderr, khiops_process.returncode + def _refresh_khiops_environment(self): + """Update the Khiops environment khiops_env only if variables have changed""" + # Check if the cache is invalidated + cache_invalidated = False + for cached_var_name, cached_var_value in self._khiops_env_cache.items(): + if ( + cached_var_name not in os.environ + or os.environ[cached_var_name] != cached_var_value + ): + cache_invalidated = True + break + + # If the cache is invalidated then refresh the environment + if cache_invalidated: + self._initialize_khiops_environment() + def _run( self, tool_name, diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb index e4ed5f8c..5cd9c434 100644 --- 
a/khiops/samples/samples.ipynb +++ b/khiops/samples/samples.ipynb @@ -78,12 +78,12 @@ "# Set the file paths\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - "output_dir = os.path.join(\"kh_samples\", \"detect_data_table_format\")\n", - "transformed_data_table_path = os.path.join(output_dir, \"AdultWithAnotherFormat.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"detect_data_table_format\")\n", + "transformed_data_table_path = os.path.join(results_dir, \"AdultWithAnotherFormat.txt\")\n", "\n", "# Create the output directory\n", - "if not os.path.isdir(output_dir):\n", - " os.mkdir(output_dir)\n", + "if not os.path.isdir(results_dir):\n", + " os.mkdir(results_dir)\n", "\n", "# Detect the format of the table\n", "format_spec = kh.detect_data_table_format(data_table_path)\n", @@ -166,10 +166,10 @@ "\n", "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - "output_dir = os.path.join(\"kh_samples\", \"export_dictionary_file\")\n", - "output_dictionary_file_path = os.path.join(output_dir, \"ModifiedAdult.kdic\")\n", - "output_dictionary_json_path = os.path.join(output_dir, \"ModifiedAdult.kdicj\")\n", - "alt_output_dictionary_json_path = os.path.join(output_dir, \"AltModifiedAdult.kdicj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"export_dictionary_file\")\n", + "output_dictionary_file_path = os.path.join(results_dir, \"ModifiedAdult.kdic\")\n", + "output_dictionary_json_path = os.path.join(results_dir, \"ModifiedAdult.kdicj\")\n", + "alt_output_dictionary_json_path = os.path.join(results_dir, \"AltModifiedAdult.kdicj\")\n", "\n", "# Load the dictionary domain from initial dictionary file\n", "# Then obtain the \"Adult\" dictionary within\n", @@ -185,10 +185,10 @@ "# Create output directory if necessary\n", "if not os.path.exists(\"kh_samples\"):\n", " os.mkdir(\"kh_samples\")\n", - " os.mkdir(output_dir)\n", + " os.mkdir(results_dir)\n", "else:\n", - " if not os.path.exists(output_dir):\n", - " os.mkdir(output_dir)\n", + " if not os.path.exists(results_dir):\n", + " os.mkdir(results_dir)\n", "\n", "# Export to kdic\n", "domain.export_khiops_dictionary_file(output_dictionary_file_path)\n", @@ -222,9 +222,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "analysis_report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_predictor\", \"AnalysisReport.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor\")\n", "\n", "# Train the predictor\n", "kh.train_predictor(\n", @@ -232,7 +230,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " analysis_report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")" ] @@ -258,17 +256,15 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_predictor_file_paths\", \"AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_file_paths\")\n", "\n", "# Train the predictor\n", - "_, modeling_dictionary_file_path = kh.train_predictor(\n", + "report_file_path, modeling_dictionary_file_path = 
kh.train_predictor(\n", " dictionary_file_path,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")\n", "print(\"Reports file available at \" + report_file_path)\n", @@ -299,10 +295,9 @@ "# Set the file paths with a nonexistent dictionary file\n", "dictionary_file_path = \"NONEXISTENT_DICTIONARY_FILE.kdic\"\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"train_predictor_error_handling\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "log_file_path = os.path.join(output_dir, \"khiops.log\")\n", - "scenario_path = os.path.join(output_dir, \"scenario._kh\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_error_handling\")\n", + "log_file_path = os.path.join(results_dir, \"khiops.log\")\n", + "scenario_path = os.path.join(results_dir, \"scenario._kh\")\n", "\n", "# Train the predictor and handle the error\n", "try:\n", @@ -311,7 +306,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " trace=True,\n", " log_file_path=log_file_path,\n", " output_scenario_path=scenario_path,\n", @@ -356,9 +351,7 @@ "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_predictor_mt\", \"AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_mt\")\n", "\n", "# Train the predictor. Besides the mandatory parameters, we specify:\n", "# - A python dictionary linking data paths to file paths for non-root tables\n", @@ -369,8 +362,8 @@ " \"Accident\",\n", " accidents_table_path,\n", " \"Gravity\",\n", - " report_file_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", " max_trees=0,\n", ")" ] @@ -398,11 +391,7 @@ "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\",\n", - " \"train_predictor_mt_with_specific_rules\",\n", - " \"AnalysisResults.khj\",\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_mt_with_specific_rules\")\n", "\n", "# Train the predictor. 
Besides the mandatory parameters, it is specified:\n", "# - A python dictionary linking data paths to file paths for non-root tables\n", @@ -414,8 +403,8 @@ " \"Accident\",\n", " accidents_table_path,\n", " \"Gravity\",\n", - " report_file_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", " max_constructed_variables=1000,\n", " construction_rules=[\"TableMode\", \"TableSelection\"],\n", " max_trees=0,\n", @@ -447,9 +436,7 @@ "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_predictor_mt_snowflake\", \"AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_mt_snowflake\")\n", "\n", "# Train the predictor. Besides the mandatory parameters, we specify:\n", "# - A python dictionary linking data paths to file paths for non-root tables\n", @@ -460,11 +447,11 @@ " \"Accident\",\n", " accidents_table_path,\n", " \"Gravity\",\n", - " report_file_path,\n", + " results_dir,\n", " additional_data_tables={\n", - " \"Vehicles\": vehicles_table_path,\n", - " \"Vehicles/Users\": users_table_path,\n", - " \"Place\": places_table_path,\n", + " \"Accident`Vehicles\": vehicles_table_path,\n", + " \"Accident`Vehicles`Users\": users_table_path,\n", + " \"Accident`Place\": places_table_path,\n", " },\n", " max_trees=0,\n", ")" @@ -491,11 +478,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\",\n", - " \"train_predictor_with_train_percentage\",\n", - " \"P90_AnalysisResults.khj\",\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_train_percentage\")\n", "\n", "# Train the predictor. 
Besides the mandatory parameters, it is specified:\n", "# - A 90% sampling rate for the training dataset\n", @@ -506,10 +489,11 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " sample_percentage=90,\n", " use_complement_as_test=True,\n", " max_trees=0,\n", + " results_prefix=\"P90_\",\n", ")" ] }, @@ -534,9 +518,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Letter\", \"Letter.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Letter\", \"Letter.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_predictor_with_trees\", \"P80_AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_trees\")\n", "\n", "# Train the predictor with at most 15 trees (default 10)\n", "kh.train_predictor(\n", @@ -544,9 +526,10 @@ " \"Letter\",\n", " data_table_path,\n", " \"lettr\",\n", - " report_file_path,\n", + " results_dir,\n", " sample_percentage=80,\n", " use_complement_as_test=True,\n", + " results_prefix=\"P80_\",\n", " max_trees=15,\n", ")" ] @@ -572,9 +555,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_predictor_with_pairs\", \"AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_pairs\")\n", "\n", "# Train the predictor with at most 10 pairs as follows:\n", "# - Include pairs age-race and capital_gain-capital_loss\n", @@ -584,7 +565,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " use_complement_as_test=True,\n", " max_trees=0,\n", " max_pairs=10,\n", @@ -617,10 +598,9 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"train_predictor_with_multiple_parameters\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_script_path = os.path.join(output_dir, \"output_scenario._kh\")\n", - "log_path = os.path.join(output_dir, \"log.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_multiple_parameters\")\n", + "output_script_path = os.path.join(results_dir, \"output_scenario._kh\")\n", + "log_path = os.path.join(results_dir, \"log.txt\")\n", "\n", "# Train the predictor. 
Besides the mandatory parameters, we specify:\n", "# - The value \"more\" as main target value\n", @@ -633,7 +613,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " main_target_value=\"more\",\n", " output_scenario_path=output_script_path,\n", " log_file_path=log_path,\n", @@ -663,9 +643,8 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"train_predictor_detect_format\")\n", - "transformed_data_table_path = os.path.join(output_dir, \"TransformedIris.txt\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_detect_format\")\n", + "transformed_data_table_path = os.path.join(results_dir, \"TransformedIris.txt\")\n", "\n", "# Transform the database format from header_line=True and field_separator=TAB\n", "# to header_line=False and field_separator=\",\"\n", @@ -686,7 +665,7 @@ " \"Iris\",\n", " transformed_data_table_path,\n", " \"Class\",\n", - " report_file_path,\n", + " results_dir,\n", " header_line=True,\n", " field_separator=\"\",\n", " )\n", @@ -703,7 +682,7 @@ " \"Iris\",\n", " transformed_data_table_path,\n", " \"Class\",\n", - " report_file_path,\n", + " results_dir,\n", ")" ] }, @@ -729,12 +708,12 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"train_predictor_with_cross_validation\")\n", - "fold_dictionary_file_path = os.path.join(output_dir, \"AdultWithFolding.kdic\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_cross_validation\")\n", + "fold_dictionary_file_path = os.path.join(results_dir, \"AdultWithFolding.kdic\")\n", "\n", "# Create the output directory\n", - "if not os.path.isdir(output_dir):\n", - " os.mkdir(output_dir)\n", + "if not os.path.isdir(results_dir):\n", + " os.mkdir(results_dir)\n", "\n", "# Load the learning dictionary object\n", "domain = kh.read_dictionary_file(dictionary_file_path)\n", @@ -771,39 +750,35 @@ "train_aucs = []\n", "test_aucs = []\n", "for fold_index in range(1, fold_number + 1):\n", - " analysis_report_file_path = os.path.join(\n", - " output_dir, \"Fold\" + str(fold_index) + \"AnalysisResults.khj\"\n", - " )\n", " # Train a model from the sub-dataset where IsInTrainDataset is 1\n", - " _, modeling_dictionary_file_path = kh.train_predictor(\n", + " train_reports_path, modeling_dictionary_file_path = kh.train_predictor(\n", " domain,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " analysis_report_file_path,\n", + " results_dir,\n", " sample_percentage=100,\n", " selection_variable=\"IsInTrainDataset\" + str(fold_index),\n", " selection_value=1,\n", " max_trees=0,\n", + " results_prefix=\"Fold\" + str(fold_index),\n", " )\n", "\n", - " evaluation_report_file_path = os.path.join(\n", - " output_dir, \"Fold\" + str(fold_index) + \"AdultEvaluationResults.khj\"\n", - " )\n", " # Evaluate the resulting model in the subsets where IsInTrainDataset is 0\n", " test_evaluation_report_path = kh.evaluate_predictor(\n", " modeling_dictionary_file_path,\n", - " \"SNB_Adult\",\n", + " \"Adult\",\n", " data_table_path,\n", - " evaluation_report_file_path,\n", + " 
results_dir,\n", " sample_percentage=100,\n", " selection_variable=\"IsInTrainDataset\" + str(fold_index),\n", " selection_value=0,\n", + " results_prefix=\"Fold\" + str(fold_index),\n", " )\n", "\n", " # Obtain the train AUC from the train report and the test AUC from the\n", " # evaluation report and print them\n", - " train_results = kh.read_analysis_results_file(analysis_report_file_path)\n", + " train_results = kh.read_analysis_results_file(train_reports_path)\n", " test_evaluation_results = kh.read_analysis_results_file(test_evaluation_report_path)\n", " train_auc = train_results.train_evaluation_report.get_snb_performance().auc\n", " test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc\n", @@ -827,45 +802,6 @@ "print(\"test auc: \" + str(mean_test_auc) + \" +- \" + str(sd_test_auc))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `interpret_predictor()`\n\n", - "Builds interpretation model for existing predictor\n\n It calls `~.api.train_predictor` and `~.api.interpret_predictor` only with\n their mandatory parameters.\n \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports\n", - "import os\n", - "from khiops import core as kh\n", - "\n", - "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"interpret_predictor\")\n", - "analysis_report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "interpretor_file_path = os.path.join(output_dir, \"InterpretationModel.kdic\")\n", - "\n", - "# Build prediction model\n", - "_, predictor_file_path = kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " analysis_report_file_path,\n", - ")\n", - "\n", - "# Build interpretation model\n", - "kh.interpret_predictor(predictor_file_path, \"SNB_Adult\", interpretor_file_path)\n", - "\n", - "print(f\"The interpretation model is '{interpretor_file_path}'\")" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -903,8 +839,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"multiple_train_predictor\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"multiple_train_predictor\")\n", "\n", "# Read the dictionary file to obtain an instance of class Dictionary\n", "dictionary_domain = kh.read_dictionary_file(dictionary_file_path)\n", @@ -912,20 +847,20 @@ "\n", "# Train a SNB model using all the variables\n", "print(\"\\t#vars\\ttrain auc\\ttest auc\")\n", - "kh.train_predictor(\n", + "json_result_file_path, _ = kh.train_predictor(\n", " dictionary_file_path,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " sample_percentage=70,\n", " use_complement_as_test=True,\n", " max_trees=0,\n", ")\n", - "display_test_results(report_file_path)\n", + "display_test_results(json_result_file_path)\n", "\n", "# Read results to obtain the variables sorted by decreasing Level\n", - "analysis_results = kh.read_analysis_results_file(report_file_path)\n", + "analysis_results = 
kh.read_analysis_results_file(json_result_file_path)\n", "preparation_results = analysis_results.preparation_report\n", "\n", "# Train a sequence of models with a decreasing number of variables\n", @@ -939,22 +874,21 @@ " dictionary.get_variable(variable.name).used = False\n", "\n", " # Train the model with this dictionary domain object\n", - " report_file_path = os.path.join(\n", - " output_dir, f\"V{variable_number - 1 - i}_AnalysisResults.khj\"\n", - " )\n", - " kh.train_predictor(\n", + " prefix = f\"V{variable_number - 1 - i}_\"\n", + " json_result_file_path, _ = kh.train_predictor(\n", " dictionary_domain,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " sample_percentage=70,\n", " use_complement_as_test=True,\n", + " results_prefix=prefix,\n", " max_trees=0,\n", " )\n", "\n", " # Show a preview of the results\n", - " display_test_results(report_file_path)" + " display_test_results(json_result_file_path)" ] }, { @@ -978,29 +912,24 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"evaluate_predictor\")\n", - "analysis_report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"evaluate_predictor\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", "\n", "# Train the predictor\n", - "_, model_dictionary_file_path = kh.train_predictor(\n", + "kh.train_predictor(\n", " dictionary_file_path,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " analysis_report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")\n", "\n", - "evaluation_report_file_path = os.path.join(output_dir, \"AdultEvaluationResults.khj\")\n", - "\n", "# Evaluate the predictor\n", - "kh.evaluate_predictor(\n", - " model_dictionary_file_path,\n", - " \"SNB_Adult\",\n", - " data_table_path,\n", - " evaluation_report_file_path,\n", + "report_file_path = kh.evaluate_predictor(\n", + " model_dictionary_file_path, \"Adult\", data_table_path, results_dir\n", ")\n", - "print(\"Evaluation report available at \" + evaluation_report_file_path)" + "print(\"Evaluation report available at \" + report_file_path)" ] }, { @@ -1024,9 +953,8 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"access_predictor_evaluation_report\", \"AdultAnalysisReport.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"access_predictor_evaluation_report\")\n", + "evaluation_report_path = os.path.join(results_dir, \"AllReports.khj\")\n", "\n", "# Train the SNB predictor and some univariate predictors\n", "# Note: Evaluation in test is 30% by default\n", @@ -1035,12 +963,13 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", + " univariate_predictor_number=4,\n", ")\n", "\n", "# Obtain the evaluation results\n", - "results = kh.read_analysis_results_file(report_file_path)\n", + "results = kh.read_analysis_results_file(evaluation_report_path)\n", "evaluation_report = results.test_evaluation_report\n", "snb_performance = evaluation_report.get_snb_performance()\n", "\n", @@ -1080,9 +1009,9 @@ " + 
str(random_lift_curve.values[i])\n", " )\n", "\n", - "# Print metrics for an SNB predictor\n", + "# Print metrics for a univariate predictor\n", "predictor_performance = evaluation_report.get_predictor_performance(\n", - " \"Selective Naive Bayes\"\n", + " \"Univariate relationship\"\n", ")\n", "print(\"\\n\\nperformance metrics for \" + predictor_performance.name)\n", "for metric_name in predictor_performance.get_metric_names():\n", @@ -1110,12 +1039,10 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\"kh_samples\", \"train_recoder\", \"AnalysisResults.khj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_recoder\")\n", "\n", "# Train the recoder model\n", - "kh.train_recoder(\n", - " dictionary_file_path, \"Adult\", data_table_path, \"class\", report_file_path\n", - ")" + "kh.train_recoder(dictionary_file_path, \"Adult\", data_table_path, \"class\", results_dir)" ] }, { @@ -1139,11 +1066,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\",\n", - " \"train_recoder_with_multiple_parameters\",\n", - " \"AnalysisResults.khj\",\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_recoder_with_multiple_parameters\")\n", "\n", "# Train the recoder model\n", "kh.train_recoder(\n", @@ -1151,7 +1074,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_pairs=10,\n", " categorical_recoding_method=\"part label\",\n", " numerical_recoding_method=\"part label\",\n", @@ -1181,9 +1104,7 @@ "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"train_recoder_mt_flatten\", \"AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_recoder_mt_flatten\")\n", "\n", "# Train the recoder. Besides the mandatory parameters, it is specified:\n", "# - A python dictionary linking data paths to file paths for non-root tables\n", @@ -1195,8 +1116,8 @@ " \"Accident\",\n", " accidents_table_path,\n", " \"Gravity\",\n", - " report_file_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", " max_constructed_variables=1000,\n", " informative_variables_only=False,\n", " categorical_recoding_method=\"none\",\n",
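These sample hunks also move the `additional_data_tables` keys from bare table-variable names such as "Vehicles" to data paths fully qualified from the root dictionary, with the backquote as separator. A sketch of the convention using the Accidents schema of these samples (the path variables are the ones defined in the surrounding cells):

    # Keys are data paths rooted at the dictionary of the main table:
    #   "<RootDictionary>`<TableVariable>[`<SubTableVariable>...]"
    additional_data_tables = {
        "Accident`Vehicles": vehicles_table_path,      # first-level table
        "Accident`Vehicles`Users": users_table_path,   # second-level table
        "Accident`Place": places_table_path,           # another first-level table
    }

When deploying, the root is the model dictionary instead, hence keys such as "SNB_Accident`Vehicles" in the deploy_model hunks below.

@@ -1211,7 +1132,7 @@ "metadata": {}, "source": [ "### `deploy_model()`\n\n", - "Deploys a model in the simplest way possible\n\n It is a call to `~.api.deploy_model` with its mandatory parameters.\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database. The model predictions are written to\n the output database.\n \n" + "Deploys a model in the simplest way possible\n\n It is a call to `~.api.deploy_model` with its mandatory parameters.\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database.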
The model predictions are written to\n the output database.\n \n" ] }, { @@ -1227,17 +1148,17 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_model\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_data_table_path = os.path.join(output_dir, \"ScoresAdult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"ScoresAdult.txt\")\n", "\n", "# Train the predictor\n", - "_, model_dictionary_file_path = kh.train_predictor(\n", + "kh.train_predictor(\n", " dictionary_file_path,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")\n", "\n", @@ -1271,18 +1192,18 @@ "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_model_mt\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_data_table_path = os.path.join(output_dir, \"TransferredAccidents.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model_mt\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"TransferredAccidents.txt\")\n", "\n", "# Train the predictor (see train_predictor_mt for details)\n", - "_, model_dictionary_file_path = kh.train_predictor(\n", + "kh.train_predictor(\n", " dictionary_file_path,\n", " \"Accident\",\n", " accidents_table_path,\n", " \"Gravity\",\n", - " report_file_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", " max_trees=0,\n", ")\n", "\n", @@ -1294,69 +1215,7 @@ " \"SNB_Accident\",\n", " accidents_table_path,\n", " output_data_table_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `deploy_model_mt_with_interpretation()`\n\n", - "Deploys a multi-table interpretor in the simplest way possible\n\n It is a call to `~.api.deploy_model` with additional parameters to handle\n multi-table deployment.\n\n In this example, a Selective Naive Bayes (SNB) interpretation model is\n deployed by applying its associated dictionary to the input database.\n The model variable importances are written to the output data table.\n \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports\n", - "import os\n", - "from khiops import core as kh\n", - "\n", - "# Set the file paths\n", - "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", - "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", - "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_model_mt\")\n", - "report_file_path = 
os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "interpretor_file_path = os.path.join(output_dir, \"InterpretationModel.kdic\")\n", - "output_data_table_path = os.path.join(output_dir, \"InterpretedAccidents.txt\")\n", - "\n", - "# Train the predictor (see train_predictor_mt for details)\n", - "# Add max_evaluated_variables so that an interpretation model can be built\n", - "# (see https://github.com/KhiopsML/khiops/issues/577)\n", - "_, model_dictionary_file_path = kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " report_file_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", - " max_trees=0,\n", - " max_evaluated_variables=10,\n", - ")\n", - "\n", - "# Interpret the predictor\n", - "kh.interpret_predictor(\n", - " model_dictionary_file_path,\n", - " \"SNB_Accident\",\n", - " interpretor_file_path,\n", - " reinforcement_target_value=\"NonLethal\",\n", - ")\n", - "\n", - "# Deploy the interpretation model on the database\n", - "# Besides the mandatory parameters, it is specified:\n", - "# - A python dictionary linking data paths to file paths for non-root tables\n", - "kh.deploy_model(\n", - " interpretor_file_path,\n", - " \"Interpretation_SNB_Accident\",\n", - " accidents_table_path,\n", - " output_data_table_path,\n", - " additional_data_tables={\"Vehicles\": vehicles_table_path},\n", + " additional_data_tables={\"SNB_Accident`Vehicles\": vehicles_table_path},\n", ")" ] }, @@ -1385,24 +1244,24 @@ "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_model_mt_snowflake\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_data_table_path = os.path.join(output_dir, \"TransferredAccidents.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model_mt_snowflake\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"TransferredAccidents.txt\")\n", "\n", "# Train the predictor. 
Besides the mandatory parameters, we specify:\n", "# - A python dictionary linking data paths to file paths for non-root tables\n", "# - To not construct any decision tree\n", "# The default number of automatic features is 100\n", - "_, model_dictionary_file_path = kh.train_predictor(\n", + "kh.train_predictor(\n", " dictionary_file_path,\n", " \"Accident\",\n", " accidents_table_path,\n", " \"Gravity\",\n", - " report_file_path,\n", + " results_dir,\n", " additional_data_tables={\n", - " \"Vehicles\": vehicles_table_path,\n", - " \"Vehicles/Users\": users_table_path,\n", - " \"Place\": places_table_path,\n", + " \"Accident`Vehicles\": vehicles_table_path,\n", + " \"Accident`Vehicles`Users\": users_table_path,\n", + " \"Accident`Place\": places_table_path,\n", " },\n", " max_trees=0,\n", ")\n", @@ -1416,9 +1275,9 @@ " accidents_table_path,\n", " output_data_table_path,\n", " additional_data_tables={\n", - " \"Vehicles\": vehicles_table_path,\n", - " \"Vehicles/Users\": users_table_path,\n", - " \"Place\": places_table_path,\n", + " \"SNB_Accident`Vehicles\": vehicles_table_path,\n", + " \"SNB_Accident`Vehicles`Users\": users_table_path,\n", + " \"SNB_Accident`Place\": places_table_path,\n", " },\n", ")" ] @@ -1444,17 +1303,17 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_model_expert\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_data_table_path = os.path.join(output_dir, \"ScoresAdult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model_expert\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"ScoresAdult.txt\")\n", "\n", "# Train the predictor\n", - "_, model_dictionary_file_path = kh.train_predictor(\n", + "kh.train_predictor(\n", " dictionary_file_path,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")\n", "\n", @@ -1498,9 +1357,8 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_classifier_for_metrics\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_data_table_path = os.path.join(output_dir, \"ScoresAdult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_classifier_for_metrics\")\n", + "output_data_table_path = os.path.join(results_dir, \"ScoresAdult.txt\")\n", "\n", "# Train the classifier for the target \"class\"\n", "_, modeling_dictionary_file_path = kh.train_predictor(\n", @@ -1508,9 +1366,10 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")\n", + "\n", "# Obtain the scores of the SNB on the test dataset to calculate the PR curve\n", "kh.deploy_predictor_for_metrics(\n", " modeling_dictionary_file_path,\n", @@ -1578,9 +1437,8 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", 
\"deploy_regressor_for_metrics\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", - "output_data_table_path = os.path.join(output_dir, \"TrueAndPredictedAges.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_regressor_for_metrics\")\n", + "output_data_table_path = os.path.join(results_dir, \"TrueAndPredictedAges.txt\")\n", "\n", "# Train the regressor for the target \"age\" (with 20% train to be quick)\n", "_, modeling_dictionary_file_path = kh.train_predictor(\n", @@ -1588,7 +1446,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"age\",\n", - " report_file_path,\n", + " results_dir,\n", " sample_percentage=20,\n", " max_trees=0,\n", ")\n", @@ -1763,17 +1621,15 @@ "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - "coclustering_report_path = os.path.join(\n", - " \"kh_samples\", \"train_coclustering\", \"CoclusteringResults.khcj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_coclustering\")\n", "\n", "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", - "kh.train_coclustering(\n", + "coclustering_report_path = kh.train_coclustering(\n", " dictionary_file_path,\n", " \"SpliceJunctionDNA\",\n", " data_table_path,\n", " [\"SampleId\", \"Char\"],\n", - " coclustering_report_path,\n", + " results_dir,\n", ")\n", "print(f\"Coclustering report file available at {coclustering_report_path}\")\n", "\n", @@ -1781,50 +1637,6 @@ "# kh.visualize_report(coclustering_report_path)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `train_instance_variable_coclustering()`\n\n", - "Trains an instance-variable coclustering model in the simplest way possible\n\n It is a call to `~.api.train_instance_variable_coclustering` with only its mandatory\n parameters.\n \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports\n", - "import os\n", - "from khiops import core as kh\n", - "\n", - "# Set the file paths\n", - "iris_dir = os.path.join(kh.get_samples_dir(), \"Iris\")\n", - "dictionary_file_path = os.path.join(iris_dir, \"Iris.kdic\")\n", - "data_table_path = os.path.join(iris_dir, \"Iris.txt\")\n", - "coclustering_report_path = os.path.join(\n", - " \"kh_samples\",\n", - " \"train_instance_variable_coclustering\",\n", - " \"CoclusteringResults.khcj\",\n", - ")\n", - "\n", - "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", - "kh.train_instance_variable_coclustering(\n", - " dictionary_file_path,\n", - " \"Iris\",\n", - " data_table_path,\n", - " coclustering_report_path,\n", - ")\n", - "print(\n", - " \"Instance-variable coclustering report file available \"\n", - " f\"at {coclustering_report_path}\"\n", - ")\n", - "\n", - "# If you have Khiops Co-Visualization installed you may open the report as follows\n", - "# kh.visualize_report(coclustering_report_path)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1847,11 +1659,9 @@ "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"simplify_coclustering\")\n", - "coclustering_file_path = os.path.join(output_dir, \"Coclustering.khcj\")\n", - 
"simplified_coclustering_file_path = os.path.join(\n", - " output_dir, \"simplified_coclustering.khcj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"simplify_coclustering\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", + "simplified_coclustering_file_name = \"simplified_coclustering.khc\"\n", "\n", "# Train coclustering model for variables \"SampleId\" and \"Char\"\n", "kh.train_coclustering(\n", @@ -1859,7 +1669,7 @@ " \"SpliceJunctionDNA\",\n", " data_table_path,\n", " [\"SampleId\", \"Char\"],\n", - " coclustering_file_path,\n", + " results_dir,\n", ")\n", "\n", "# Simplify the trained coclustering with the constraints\n", @@ -1867,7 +1677,8 @@ "# - maximum total parts number: 4\n", "kh.simplify_coclustering(\n", " coclustering_file_path,\n", - " simplified_coclustering_file_path,\n", + " simplified_coclustering_file_name,\n", + " results_dir,\n", " max_preserved_information=80,\n", " max_total_parts=4,\n", ")" @@ -1891,9 +1702,9 @@ "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"extract_clusters\")\n", - "coclustering_file_path = os.path.join(output_dir, \"Coclustering.khcj\")\n", - "clusters_file_path = os.path.join(output_dir, \"extracted_clusters.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"extract_clusters\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", + "clusters_file_path = os.path.join(results_dir, \"extracted_clusters.txt\")\n", "\n", "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", "kh.train_coclustering(\n", @@ -1901,7 +1712,7 @@ " \"SpliceJunctionDNA\",\n", " data_table_path,\n", " [\"SampleId\", \"Char\"],\n", - " coclustering_file_path,\n", + " results_dir,\n", ")\n", "\n", "# Extract clusters\n", @@ -1913,7 +1724,7 @@ "metadata": {}, "source": [ "### `deploy_coclustering()`\n\n", - "Deploys a coclustering\n" + "Deploys an \"individual-variable\" coclustering\n" ] }, { @@ -1930,10 +1741,8 @@ "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_coclustering\")\n", - "coclustering_file_path = os.path.join(output_dir, \"Coclustering.khcj\")\n", - "coclustering_dictionary_file_path = os.path.join(output_dir, \"Coclustering.kdic\")\n", - "output_data_table_path = os.path.join(output_dir, \"DeployedSpliceJunctionDNA.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_coclustering\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", "\n", "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", "kh.train_coclustering(\n", @@ -1941,7 +1750,7 @@ " \"SpliceJunctionDNA\",\n", " data_table_path,\n", " [\"SampleId\", \"Char\"],\n", - " coclustering_file_path,\n", + " results_dir,\n", ")\n", "\n", "# Deploy \"Char\" clusters in the training database\n", @@ -1952,8 +1761,7 @@ " coclustering_file_path,\n", " [\"SampleId\"],\n", " \"Char\",\n", - " coclustering_dictionary_file_path,\n", - " output_data_table_path,\n", + " results_dir,\n", " header_line=True,\n", ")" ] @@ -1963,7 +1771,7 @@ "metadata": {}, "source": [ "### 
`deploy_coclustering_expert()`\n\n", - "Deploys a coclustering step-by-step\n\n The `.api.prepare_coclustering_deployment` method is called twice to prepare the\n deployment at two granularity levels. Then, the model is deployed and the respective\n deployment dictionary is built.\n\n This is one of the most complex workflows of the Khiops suite.\n \n" + "Deploys an \"individual-variable\" coclustering step-by-step\n\n The `.api.prepare_coclustering_deployment` method is called twice to prepare the\n deployment at two granularity levels. Then, the model is deployed and the respective\n deployment dictionary is built.\n\n This is one of the most complex workflows of the Khiops suite.\n \n" ] }, { @@ -1981,8 +1789,8 @@ "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", "data_table_path = os.path.join(splice_dir, \"SpliceJunction.txt\")\n", "secondary_data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"deploy_coclustering_expert\")\n", - "coclustering_file_path = os.path.join(output_dir, \"Coclustering.khcj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_coclustering_expert\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", "\n", "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", "print(\"train coclustering on SpliceJunctionDNA\")\n", @@ -1991,38 +1799,39 @@ " \"SpliceJunctionDNA\",\n", " secondary_data_table_path,\n", " [\"SampleId\", \"Char\"],\n", - " coclustering_file_path,\n", + " results_dir,\n", ")\n", "\n", "print(\"prepare_coclustering_deployment\")\n", "# The input dictionary is extended with new coclustering based variables\n", - "augmented_dictionary_file_path = os.path.join(output_dir, \"Coclustering.kdic\")\n", "kh.prepare_coclustering_deployment(\n", " dictionary_file_path,\n", " \"SpliceJunction\",\n", " coclustering_file_path,\n", " \"DNA\",\n", " \"SampleId\",\n", - " augmented_dictionary_file_path,\n", + " results_dir,\n", ")\n", + "augmented_dictionary_file_path = os.path.join(results_dir, \"Coclustering.kdic\")\n", "\n", "print(\"prepare_coclustering_deployment with at most two clusters\")\n", "# Extend the already extended dictionary with the new variables from a simplified CC\n", - "reaugmented_dictionary_file_path = os.path.join(\n", - " output_dir, \"ReaugmentedCoclustering.kdic\"\n", - ")\n", "kh.prepare_coclustering_deployment(\n", " augmented_dictionary_file_path,\n", " \"SpliceJunction\",\n", " coclustering_file_path,\n", " \"DNA\",\n", " \"SampleId\",\n", - " reaugmented_dictionary_file_path,\n", + " results_dir,\n", + " results_prefix=\"Reaugmented\",\n", " variables_prefix=\"C2_\",\n", " max_part_numbers={\"SampleId\": 2},\n", ")\n", "\n", - "output_data_table_path = os.path.join(output_dir, \"TransferredSpliceJunction.txt\")\n", + "reaugmented_dictionary_file_path = os.path.join(\n", + " results_dir, \"ReaugmentedCoclustering.kdic\"\n", + ")\n", + "output_data_table_path = os.path.join(results_dir, \"TransferredSpliceJunction.txt\")\n", "\n", "# Deploy the coclustering with the extended dictionary\n", "print(\"deploy_model with the new coclustering based variables\")\n", @@ -2031,11 +1840,11 @@ " \"SpliceJunction\",\n", " data_table_path,\n", " output_data_table_path,\n", - " additional_data_tables={\"DNA\": secondary_data_table_path},\n", + " additional_data_tables={\"SpliceJunction`DNA\": secondary_data_table_path},\n", ")\n", "\n", "deployed_dictionary_file_path = os.path.join(\n", - " 
output_dir, \"Transferred_Coclustering.kdic\"\n", + " results_dir, \"Transferred_Coclustering.kdic\"\n", ")\n", "print(\"build_deployed_dictionary to get the new dictionary\")\n", "kh.build_deployed_dictionary(\n", @@ -2066,9 +1875,7 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - "report_file_path = os.path.join(\n", - " \"kh_samples\", \"scenario_prologue\", \"AnalysisResults.khj\"\n", - ")\n", + "results_dir = os.path.join(\"kh_samples\", \"scenario_prologue\")\n", "\n", "# Set the maximum memory \"by hand\" with an scenario prologue\n", "scenario_prologue = \"\"\"\n", @@ -2082,7 +1889,7 @@ " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", " scenario_prologue=scenario_prologue,\n", ")" @@ -2109,9 +1916,8 @@ "# Set the file paths\n", "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", "data_table_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - "output_dir = os.path.join(\"kh_samples\", \"build_deployed_dictionary\")\n", - "deployed_dictionary_file_path = os.path.join(output_dir, \"SNB_Iris_deployed.kdic\")\n", - "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", + "results_dir = os.path.join(\"kh_samples\", \"build_deployed_dictionary\")\n", + "deployed_dictionary_file_path = os.path.join(results_dir, \"SNB_Iris_deployed.kdic\")\n", "\n", "# Train the predictor\n", "_, modeling_dictionary_file_path = kh.train_predictor(\n", @@ -2119,7 +1925,7 @@ " \"Iris\",\n", " data_table_path,\n", " \"Class\",\n", - " report_file_path,\n", + " results_dir,\n", " max_trees=0,\n", ")\n", "\n", diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 6a15c898..c79fe141 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -167,7 +167,8 @@ "import os\n", "import pandas as pd\n", "from khiops import core as kh\n", - "from khiops.sklearn import KhiopsClassifier, train_test_split_dataset\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", "from sklearn import metrics\n", "\n", "# Load the dataset into pandas dataframes\n", @@ -235,7 +236,8 @@ "import os\n", "import pandas as pd\n", "from khiops import core as kh\n", - "from khiops.sklearn import KhiopsClassifier, train_test_split_dataset\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", "from sklearn import metrics\n", "\n", "# Load the dataset tables into dataframes\n", @@ -243,9 +245,7 @@ "accidents_df = pd.read_csv(os.path.join(accidents_data_dir, \"Accidents.txt\"), sep=\"\\t\")\n", "users_df = pd.read_csv(os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\")\n", "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", - "places_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", low_memory=False\n", - ")\n", + "places_df = pd.read_csv(os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\")\n", "\n", "# Build the multi-table dataset spec (drop the target column \"Gravity\")\n", "X = {\n", @@ -466,7 +466,7 @@ " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", ")\n", "\n", - "# Split the secondary dataframe with the keys of 
the split root dataframe\n", + "# Split the secondary dataframe with the keys of the split root dataframe\n", "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", @@ -706,9 +706,7 @@ "accidents_df = pd.read_csv(os.path.join(accidents_data_dir, \"Accidents.txt\"), sep=\"\\t\")\n", "users_df = pd.read_csv(os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\")\n", "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", - "places_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", low_memory=False\n", - ")\n", + "places_df = pd.read_csv(os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\")\n", "\n", "# Build the multi-table dataset spec (drop the target column \"Gravity\")\n", "X = {\n", @@ -869,7 +867,7 @@ " keep_initial_variables=True,\n", " transform_type_categorical=\"part_id\",\n", " transform_type_numerical=\"part_id\",\n", - " transform_type_pairs=\"part_id\",\n", + " transform_pairs=\"part_id\",\n", ")\n", "khe.fit(X, y)\n", "\n", @@ -972,6 +970,185 @@ "print(X_clusters)\n", "print(\"---\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_list()`\n\n", + "Trains a KhiopsClassifier using a list dataset specification\n\n .. warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the root table of the dataset into a pandas dataframe\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + ")\n", + "X = accidents_df.drop(\"Gravity\", axis=1)\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", + "\n", + "# Split the secondary dataframe with the keys of the split root dataframe\n", + "X_train_ids = X_train[\"AccidentId\"].to_frame()\n", + "X_test_ids = X_test[\"AccidentId\"].to_frame()\n", + "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "\n", + "# Create the classifier specifying the key column name\n", + "khc = KhiopsClassifier(key=\"AccidentId\")\n", + "\n", + "# Train the classifier\n", + "khc.fit([X_train, X_train_secondary], y_train)\n", + "\n", + "# Predict the class on the test dataset\n", + "y_test_pred = khc.predict([X_test, X_test_secondary])\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probability on the test dataset\n", + "y_test_probas = khc.predict_proba([X_test, X_test_secondary])\n", + "print(\"Predicted class probabilities
(first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_star_file()`\n\n", + "Trains a `.KhiopsClassifier` with a file path based dataset\n\n .. warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n If you need to handle large datasets that do not easily fit into memory then you\n may use the `~.khiops.core` API directly, which allows to specify file paths\n directly.\n \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Create output directory\n", + "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_star_file\")\n", + "if not os.path.exists(\"kh_samples\"):\n", + " os.mkdir(\"kh_samples\")\n", + " os.mkdir(results_dir)\n", + "else:\n", + " if not os.path.exists(results_dir):\n", + " os.mkdir(results_dir)\n", + "\n", + "# Load the root table of the dataset into a pandas dataframe\n", + "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + ")\n", + "\n", + "# Split the root dataframe into train and test\n", + "X_train_main, X_test_main = train_test_split(\n", + " accidents_df, test_size=0.3, random_state=1\n", + ")\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "vehicles_df = pd.read_csv(\n", + " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", + ")\n", + "\n", + "# Split the secondary dataframe with the keys of the splitted root dataframe\n", + "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", + "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", + "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "\n", + "# Write the train and test dataset sets to disk\n", + "# For the test file we remove the target column from the main table\n", + "X_train_main_path = os.path.join(results_dir, \"X_train_main.txt\")\n", + "X_train_main.to_csv(X_train_main_path, sep=\"\\t\", header=True, index=False)\n", + "X_train_secondary_path = os.path.join(results_dir, \"X_train_secondary.txt\")\n", + "X_train_secondary.to_csv(X_train_secondary_path, sep=\"\\t\", header=True, index=False)\n", + "X_test_main_path = os.path.join(results_dir, \"X_test_main.txt\")\n", + "y_test = X_test_main.sort_values(\"AccidentId\")[\"Gravity\"]\n", + "X_test_main.drop(columns=\"Gravity\").to_csv(\n", + " X_test_main_path, sep=\"\\t\", header=True, index=False\n", + ")\n", + "X_test_secondary_path = os.path.join(results_dir, \"X_test_secondary.txt\")\n", + "X_test_secondary.to_csv(X_test_secondary_path, sep=\"\\t\", header=True, index=False)\n", + "\n", + "# Define the dictionary of train\n", + "X_train = {\n", + " 
\"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (X_train_main_path, \"AccidentId\"),\n", + " \"Vehicles\": (X_train_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + " \"format\": (\"\\t\", True),\n", + "}\n", + "X_test = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (X_test_main_path, \"AccidentId\"),\n", + " \"Vehicles\": (X_test_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + " \"format\": (\"\\t\", True),\n", + "}\n", + "\n", + "# Create the classifier and fit it\n", + "khc = KhiopsClassifier(output_dir=results_dir)\n", + "khc.fit(X_train, y=\"Gravity\")\n", + "\n", + "# Predict the class in addition to the class probabilities on the test dataset\n", + "y_test_pred_path = khc.predict(X_test)\n", + "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[\"PredictedGravity\"].head(10))\n", + "print(\"---\")\n", + "\n", + "y_test_probas_path = khc.predict_proba(X_test)\n", + "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", + "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[proba_columns].head(10))\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityLethal\"])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] } ], "metadata": {}, diff --git a/tests/test_khiops_integrations.py b/tests/test_khiops_integrations.py index 902e3070..6d8c90fb 100644 --- a/tests/test_khiops_integrations.py +++ b/tests/test_khiops_integrations.py @@ -8,6 +8,7 @@ import os import platform +import shlex import shutil import stat import subprocess @@ -87,10 +88,8 @@ def test_runner_has_mpiexec_on_linux(self): else: self.skipTest("Skipping test: platform not Ubuntu or Rocky Linux") if openmpi_found: - runner = kh.get_runner() - if not runner.mpi_command_args: - self.fail("MPI support found, but MPI command args not set") - mpiexec_path = runner.mpi_command_args[0] + _ = kh.get_runner() + mpiexec_path = shlex.split(os.environ["KHIOPS_MPI_COMMAND"])[0] self.assertTrue(os.path.exists(mpiexec_path)) self.assertTrue(os.path.isfile(mpiexec_path)) self.assertTrue(os.access(mpiexec_path, os.X_OK)) @@ -174,17 +173,17 @@ def test_runner_environment_initialization(self): To test a real initialization this test should be executed alone. 
""" # Obtain the current runner - runner = kh.get_runner() + _ = kh.get_runner() # Check that MODL* files as set in the runner exist and are executable - self.assertTrue(os.path.isfile(runner.khiops_path)) - self.assertTrue(os.access(runner.khiops_path, os.X_OK)) - self.assertTrue(os.path.isfile(runner.khiops_coclustering_path)) - self.assertTrue(os.access(runner.khiops_coclustering_path, os.X_OK)) + self.assertTrue(os.path.isfile(os.environ["KHIOPS_PATH"])) + self.assertTrue(os.access(os.environ["KHIOPS_PATH"], os.X_OK)) + self.assertTrue(os.path.isfile(os.environ["KHIOPS_COCLUSTERING_PATH"])) + self.assertTrue(os.access(os.environ["KHIOPS_COCLUSTERING_PATH"], os.X_OK)) # Check that mpiexec is set correctly in the runner: - if runner.mpi_command_args: - mpiexec_path = runner.mpi_command_args[0] + if os.environ["KHIOPS_MPI_COMMAND"]: + mpiexec_path = shlex.split(os.environ["KHIOPS_MPI_COMMAND"])[0] self.assertTrue(os.path.exists(mpiexec_path)) self.assertTrue(os.path.isfile(mpiexec_path)) self.assertTrue(os.access(mpiexec_path, os.X_OK)) From 0419afbbaaeb20b1047bf79fa6b5bc037e265f1c Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Fri, 14 Feb 2025 14:02:59 +0100 Subject: [PATCH 3/4] Run small (but numerous) sklearn tests with 1 core --- tests/test_sklearn.py | 14 ++++++++++++++ tests/test_sklearn_output_types.py | 18 +++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index e36df908..14a00714 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1685,6 +1685,20 @@ def test_parameter_transfer_coclustering_predict_from_dataframe(self): class KhiopsSklearnEstimatorStandardTests(unittest.TestCase): """Tests to comply with `sklearn.util.estimator_checks.check_estimator`""" + _env_khiops_proc_number = None + + @classmethod + def setUpClass(cls): + # Set the number of processes to 1: Lots of test on small datasets + kh.get_runner() # Just to activate the lazy initialization + cls._env_khiops_proc_number = os.environ["KHIOPS_PROC_NUMBER"] + os.environ["KHIOPS_PROC_NUMBER"] = "1" + + @classmethod + def tearDownClass(cls): + # Restore the original number of processes + os.environ["KHIOPS_PROC_NUMBER"] = cls._env_khiops_proc_number + def test_sklearn_check_estimator(self): # Set the estimators to test # Notes: diff --git a/tests/test_sklearn_output_types.py b/tests/test_sklearn_output_types.py index 572f378e..f4df7656 100644 --- a/tests/test_sklearn_output_types.py +++ b/tests/test_sklearn_output_types.py @@ -5,6 +5,7 @@ # see the "LICENSE.md" file for more details. 
 # ######################################################################################
 """Tests for checking the output types of predictors"""
+import os
 import unittest
 
 import numpy as np
@@ -13,8 +14,8 @@
 from sklearn import datasets
 from sklearn.utils.validation import column_or_1d
 
+from khiops import core as kh
 from khiops.sklearn.estimators import KhiopsClassifier, KhiopsRegressor
-from tests.test_helper import KhiopsTestHelper
 
 # Disable PEP8 variable names because of scikit-learn X,y conventions
 # To capture invalid-names other than X,y run:
@@ -48,8 +49,19 @@ def create_iris_mt():
 class KhiopsSklearnOutputTypes(unittest.TestCase):
     """Tests for checking the output types of predictors"""
 
-    def setUp(self):
-        KhiopsTestHelper.skip_expensive_test(self)
+    _env_khiops_proc_number = None
+
+    @classmethod
+    def setUpClass(cls):
+        # Set the number of processes to 1: lots of tests on small datasets
+        kh.get_runner()  # Just to activate the lazy initialization
+        cls._env_khiops_proc_number = os.environ["KHIOPS_PROC_NUMBER"]
+        os.environ["KHIOPS_PROC_NUMBER"] = "1"
+
+    @classmethod
+    def tearDownClass(cls):
+        # Restore the original number of processes
+        os.environ["KHIOPS_PROC_NUMBER"] = cls._env_khiops_proc_number
 
     def _replace(self, array, replacement_dict):
         return np.array([replacement_dict[value] for value in array])

From 3bbdc9bfc2ee0d8e686adb8327bc29518e6e7a81 Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Wed, 19 Feb 2025 15:46:31 +0100
Subject: [PATCH 4/4] Update CHANGELOG.md

---
 CHANGELOG.md | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f2cec058..15aadde0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,30 @@
 - Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1.
 - Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists.
 
-## 10.2.4.0 - 2024-12-19
+## Unreleased
+
+### Changed
+- (Internals) Users can now change the Khiops environment variables without restarting Python or
+creating a new runner.
+
+## 10.3.0.0 - 2025-02-10
+
+### Fixed
+- (`core`) Dictionary file `.json` extension check in the `khiops.dictionary.read_dictionary_file`
+function.
+
+### Changed
+- (`sklearn`) The `train_test_split_dataset` helper has been moved from `khiops.utils` to
+`khiops.sklearn`.
+- (`sklearn`) The `transform_type_pairs` parameter of the `KhiopsEncoder` sklearn estimator has
+been renamed to `transform_pairs`.
+
+### Removed
+- (`sklearn`) The `is_fitted_` estimator attribute. The Scikit-learn `check_is_fitted` function
+can be used to test the fitted state of the estimators.
+- (`sklearn`) The `n_pairs` parameter of the `KhiopsRegressor` sklearn estimator. It was never
+supported.
+
 ### Added
 - (General) Support for Python 3.13.
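
For illustration, here is a minimal sketch of the behavior behind the *Unreleased* entry above: the runner now reads the Khiops environment variables when they are needed, so changing them takes effect without restarting Python or creating a new runner. The sketch only uses `kh.get_runner()` and the `KHIOPS_PROC_NUMBER` variable that the tests in this patch manipulate; treat it as illustrative, not as the definitive API.

```python
# Sketch: change a Khiops environment variable without recreating the runner.
# The save-and-restore around the change mirrors the setUpClass/tearDownClass
# pattern used by the tests in this patch.
import os

from khiops import core as kh

kh.get_runner()  # Trigger the runner's lazy initialization

saved_proc_number = os.environ.get("KHIOPS_PROC_NUMBER")
os.environ["KHIOPS_PROC_NUMBER"] = "1"  # Limit Khiops to a single process
try:
    # Subsequent khiops.core or khiops.sklearn calls pick up the new value
    # without restarting Python or creating a new runner
    pass
finally:
    # Restore the previous setting
    if saved_proc_number is None:
        os.environ.pop("KHIOPS_PROC_NUMBER", None)
    else:
        os.environ["KHIOPS_PROC_NUMBER"] = saved_proc_number
```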
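Similarly, the 10.3.0.0 entries can be summarized in code. This is a sketch based only on the changelog above and the notebook update in PATCH 2; the `part_id` values are simply the ones the samples use, and the `train_test_split_dataset` import location follows the changelog entry.

```python
# Sketch of the 10.3.0.0 API changes listed in the changelog above
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Moved: the train_test_split_dataset helper now lives in khiops.sklearn
from khiops.sklearn import KhiopsClassifier, KhiopsEncoder, train_test_split_dataset

# Renamed: the former transform_type_pairs parameter is now transform_pairs
khe = KhiopsEncoder(
    keep_initial_variables=True,
    transform_type_categorical="part_id",
    transform_type_numerical="part_id",
    transform_pairs="part_id",
)

# Removed: the is_fitted_ attribute; use scikit-learn's check_is_fitted instead
khc = KhiopsClassifier()
try:
    check_is_fitted(khc)
except NotFittedError:
    print("Not fitted yet")  # Expected before khc.fit(...) is called
```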