diff --git a/.editorconfig b/.editorconfig
index 0e67d4457..98a73a58e 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -5,7 +5,7 @@ end_of_line = lf
 insert_final_newline = true
 charset = utf-8
 
-[*.py]
+[*.{py,rs}]
 indent_style = space
 indent_size = 4
diff --git a/pgml-extension/.dockerignore b/pgml-extension/.dockerignore
index 68bc17f9f..85aadc2fe 100644
--- a/pgml-extension/.dockerignore
+++ b/pgml-extension/.dockerignore
@@ -158,3 +158,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+pgml_rust/target/
diff --git a/pgml-extension/Dockerfile b/pgml-extension/Dockerfile
index 67f9e66d5..d0daf3425 100644
--- a/pgml-extension/Dockerfile
+++ b/pgml-extension/Dockerfile
@@ -1,10 +1,18 @@
+# FROM rust:1-bullseye AS rust_builder
+# COPY pgml_rust /pgml_rust
+# WORKDIR /pgml_rust
+# RUN apt-get update && apt-get install -y postgresql-13 libpq-dev cmake libclang-dev
+# RUN cargo install cargo-pgx
+# RUN cargo pgx init
+# RUN cargo pgx package
+
 FROM debian:bullseye-slim
 MAINTAINER docker@postgresml.com
 RUN apt-get update
 ARG DEBIAN_FRONTEND=noninteractive
 ENV TZ=Etc/UTC
-RUN apt-get install -y postgresql-plpython3-13 python3 python3-pip postgresql-13 tzdata sudo cmake libpq-dev
+RUN apt-get install -y postgresql-plpython3-13 python3 python3-pip postgresql-13 tzdata sudo cmake libpq-dev libclang-dev
 
 # Cache this, quicker
 RUN pip3 install xgboost sklearn diptest torch lightgbm transformers datasets sentencepiece sacremoses sacrebleu rouge
diff --git a/pgml-extension/pgml_rust/.cargo/config b/pgml-extension/pgml_rust/.cargo/config
new file mode 100644
index 000000000..2b25fcd1d
--- /dev/null
+++ b/pgml-extension/pgml_rust/.cargo/config
@@ -0,0 +1,3 @@
+[build]
+# Postgres symbols won't be available until runtime
+rustflags = ["-C", "link-args=-Wl,-undefined,dynamic_lookup"]
diff --git a/pgml-extension/pgml_rust/.gitignore b/pgml-extension/pgml_rust/.gitignore
new file mode 100644
index 000000000..3906c3324
--- /dev/null
+++ b/pgml-extension/pgml_rust/.gitignore
@@ -0,0 +1,6 @@
+.DS_Store
+.idea/
+/target
+*.iml
+**/*.rs.bk
+Cargo.lock
diff --git a/pgml-extension/pgml_rust/Cargo.toml b/pgml-extension/pgml_rust/Cargo.toml
new file mode 100644
index 000000000..1cbed25f3
--- /dev/null
+++ b/pgml-extension/pgml_rust/Cargo.toml
@@ -0,0 +1,36 @@
+[package]
+name = "pgml_rust"
+version = "0.0.0"
+edition = "2021"
+
+[lib]
+crate-type = ["cdylib"]
+
+[features]
+default = ["pg13"]
+pg10 = ["pgx/pg10", "pgx-tests/pg10" ]
+pg11 = ["pgx/pg11", "pgx-tests/pg11" ]
+pg12 = ["pgx/pg12", "pgx-tests/pg12" ]
+pg13 = ["pgx/pg13", "pgx-tests/pg13" ]
+pg14 = ["pgx/pg14", "pgx-tests/pg14" ]
+pg_test = []
+
+[dependencies]
+pgx = "=0.4.5"
+xgboost = { path = "rust-xgboost" }
+rustlearn = "0.5"
+once_cell = "1"
+rand = "0.8"
+
+[dev-dependencies]
+pgx-tests = "=0.4.5"
+
+[profile.dev]
+panic = "unwind"
+lto = "thin"
+
+[profile.release]
+panic = "unwind"
+opt-level = 3
+lto = "fat"
+codegen-units = 1
diff --git a/pgml-extension/pgml_rust/README.md b/pgml-extension/pgml_rust/README.md
new file mode 100644
index 000000000..14414e7cf
--- /dev/null
+++ b/pgml-extension/pgml_rust/README.md
@@ -0,0 +1,23 @@
+# Rust meets PostgresML
+
+Here we have some POC code to use Rust for PostgresML.
+
+## Dependencies
+
+All dependencies are vendored. I downloaded XGBoost 1.6.2 and all its submodules. We're also using the `master` branch of the `xgboost` Rust crate.
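+
+If you're starting from a fresh checkout and the vendored submodules are missing, something like this should fetch them:
+
+```bash
+git submodule update --init --recursive
+```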
+
+If you haven't already, install:
+
+- `cmake`
+- `libclang-dev`
+
+## Local development
+
+1. `cargo install cargo-pgx`
+2. `cargo pgx run`
+3. `DROP EXTENSION IF EXISTS pgml_rust;`
+4. `CREATE EXTENSION pgml_rust;`
+5. `SELECT pgml_train('pgml.diabetes', ARRAY['age', 'sex'], 'target');`
+6. `SELECT * FROM pgml_predict(ARRAY[1, 5.0]);`
+
+Lots of todos, but still a decent PoC.
diff --git a/pgml-extension/pgml_rust/pgml_rust.control b/pgml-extension/pgml_rust/pgml_rust.control
new file mode 100644
index 000000000..05223ba7c
--- /dev/null
+++ b/pgml-extension/pgml_rust/pgml_rust.control
@@ -0,0 +1,5 @@
+comment = 'pgml_rust: Created by pgx'
+default_version = '@CARGO_VERSION@'
+module_pathname = '$libdir/pgml_rust'
+relocatable = false
+superuser = false
diff --git a/pgml-extension/pgml_rust/rust-xgboost/.gitignore b/pgml-extension/pgml_rust/rust-xgboost/.gitignore
new file mode 100644
index 000000000..16355dadd
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/.gitignore
@@ -0,0 +1,12 @@
+# Generated by Cargo
+# will have compiled files and executables
+/target/
+/examples/*/target/
+/xgboost-sys/target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
diff --git a/pgml-extension/pgml_rust/rust-xgboost/.gitmodules b/pgml-extension/pgml_rust/rust-xgboost/.gitmodules
new file mode 100644
index 000000000..cbbe4a522
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "xgboost-sys/xgboost"]
+	path = xgboost-sys/xgboost
+	url = https://github.com/davechallis/xgboost
+	branch = master
diff --git a/pgml-extension/pgml_rust/rust-xgboost/.travis.yml b/pgml-extension/pgml_rust/rust-xgboost/.travis.yml
new file mode 100644
index 000000000..c28ef0f96
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/.travis.yml
@@ -0,0 +1,22 @@
+language: rust
+
+os:
+  - linux
+  - osx
+
+rust:
+  - stable
+  - nightly
+matrix:
+  allow_failures:
+    - rust: nightly
+  fast_finish: true
+
+cache: cargo
+
+script:
+  - cd xgboost-sys && cargo test --verbose --all
+  - cd .. && cargo test --verbose --all
+  - cd examples/basic && cargo run
+  - cd ../custom_objective && cargo run
+  - cd ../generalised_linear_model && cargo run
diff --git a/pgml-extension/pgml_rust/rust-xgboost/CHANGELOG.md b/pgml-extension/pgml_rust/rust-xgboost/CHANGELOG.md
new file mode 100644
index 000000000..83abe1147
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/CHANGELOG.md
@@ -0,0 +1,3 @@
+# 0.1.4 (2019-03-05)
+
+* `Booster::load_buffer` method added (thanks [jonathanstrong](https://github.com/jonathanstrong))
diff --git a/pgml-extension/pgml_rust/rust-xgboost/Cargo.toml b/pgml-extension/pgml_rust/rust-xgboost/Cargo.toml
new file mode 100644
index 000000000..465ee70a0
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "xgboost"
+version = "0.2.0"
+authors = ["Dave Challis "]
+license = "MIT"
+repository = "https://github.com/davechallis/rust-xgboost"
+homepage = "https://github.com/davechallis/rust-xgboost"
+description = "Machine learning using XGBoost"
+documentation = "https://docs.rs/xgboost"
+readme = "README.md"
+
+[dependencies]
+xgboost-sys = { path = "xgboost-sys" }
+libc = "0.2"
+derive_builder = "0.5"
+log = "0.4"
+tempfile = "3.0"
+indexmap = "1.0"
diff --git a/pgml-extension/pgml_rust/rust-xgboost/LICENSE b/pgml-extension/pgml_rust/rust-xgboost/LICENSE
new file mode 100644
index 000000000..55bea104e
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Dave Challis
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/pgml-extension/pgml_rust/rust-xgboost/README.md b/pgml-extension/pgml_rust/rust-xgboost/README.md
new file mode 100644
index 000000000..009f86925
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/README.md
@@ -0,0 +1,95 @@
+# rust-xgboost
+
+[![Travis Build Status](https://travis-ci.com/davechallis/rust-xgboost.svg?branch=master)](https://travis-ci.com/davechallis/rust-xgboost)
+[![Documentation link](https://docs.rs/xgboost/badge.svg)](https://docs.rs/xgboost)
+
+Rust bindings for the [XGBoost](https://xgboost.ai) gradient boosting library.
+
+* [Documentation](https://docs.rs/xgboost)
+
+Basic usage example:
+
+```rust
+extern crate xgboost;
+
+use xgboost::{parameters, DMatrix, Booster};
+
+fn main() {
+    // training matrix with 5 training examples and 3 features
+    let x_train = &[1.0, 1.0, 1.0,
+                    1.0, 1.0, 0.0,
+                    1.0, 1.0, 1.0,
+                    0.0, 0.0, 0.0,
+                    1.0, 1.0, 1.0];
+    let num_rows = 5;
+    let y_train = &[1.0, 1.0, 1.0, 0.0, 1.0];
+
+    // convert training data into XGBoost's matrix format
+    let mut dtrain = DMatrix::from_dense(x_train, num_rows).unwrap();
+
+    // set ground truth labels for the training matrix
+    dtrain.set_labels(y_train).unwrap();
+
+    // test matrix with 1 row
+    let x_test = &[0.7, 0.9, 0.6];
+    let num_rows = 1;
+    let y_test = &[1.0];
+    let mut dtest = DMatrix::from_dense(x_test, num_rows).unwrap();
+    dtest.set_labels(y_test).unwrap();
+
+    // configure objectives, metrics, etc.
+    let learning_params = parameters::learning::LearningTaskParametersBuilder::default()
+        .objective(parameters::learning::Objective::BinaryLogistic)
+        .build().unwrap();
+
+    // configure the tree-based learning model's parameters
+    let tree_params = parameters::tree::TreeBoosterParametersBuilder::default()
+        .max_depth(2)
+        .eta(1.0)
+        .build().unwrap();
+
+    // overall configuration for Booster
+    let booster_params = parameters::BoosterParametersBuilder::default()
+        .booster_type(parameters::BoosterType::Tree(tree_params))
+        .learning_params(learning_params)
+        .verbose(true)
+        .build().unwrap();
+
+    // specify datasets to evaluate against during training
+    let evaluation_sets = &[(&dtrain, "train"), (&dtest, "test")];
+
+    // overall configuration for training/evaluation
+    let params = parameters::TrainingParametersBuilder::default()
+        .dtrain(&dtrain)                         // dataset to train with
+        .boost_rounds(2)                         // number of training iterations
+        .booster_params(booster_params)          // model parameters
+        .evaluation_sets(Some(evaluation_sets))  // optional datasets to evaluate against in each iteration
+        .build().unwrap();
+
+    // train model, and print evaluation data
+    let bst = Booster::train(&params).unwrap();
+
+    println!("{:?}", bst.predict(&dtest).unwrap());
+}
+```
+
+See the [examples](https://github.com/davechallis/rust-xgboost/tree/master/examples) directory for
+more detailed examples of different features.
+
+## Status
+
+Currently in a very early stage of development, so the API is changing as usability issues occur,
+or new features are supported.
+
+Builds against XGBoost 0.81.
+
+### Platforms
+
+Tested:
+
+* Linux
+* Mac OS
+
+Unsupported:
+
+* Windows
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/README.md b/pgml-extension/pgml_rust/rust-xgboost/examples/README.md
new file mode 100644
index 000000000..fc1965f27
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/README.md
@@ -0,0 +1,6 @@
+# xgboost feature examples
+
+* [Basic usage](basic/src/main.rs)
+* [Custom objective and evaluation functions](custom_objective/src/main.rs)
+* [Generalised linear model](generalised_linear_model/src/main.rs)
+* [Multiclass classification](multiclass_classification/src/main.rs)
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/basic/Cargo.toml b/pgml-extension/pgml_rust/rust-xgboost/examples/basic/Cargo.toml
new file mode 100644
index 000000000..d8cbd2894
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/basic/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "xgboost-basic-example"
+version = "0.1.0"
+authors = ["Dave Challis "]
+publish = false
+
+[dependencies]
+xgboost = { path = "../../" }
+sprs = "0.11"
+log = "0.4"
+env_logger = "0.5"
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/basic/src/main.rs b/pgml-extension/pgml_rust/rust-xgboost/examples/basic/src/main.rs
new file mode 100644
index 000000000..2e8955ec7
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/basic/src/main.rs
@@ -0,0 +1,129 @@
+extern crate xgboost;
+extern crate sprs;
+extern crate env_logger;
+
+use std::io::{BufRead, BufReader};
+use std::fs::File;
+use xgboost::{parameters, DMatrix, Booster};
+
+fn main() {
+    // initialise logging, run with e.g. RUST_LOG=xgboost=debug to see more details
+    env_logger::init();
+
+    // load train and test matrices from text files (in LibSVM format).
+    println!("Loading train and test matrices...");
+    let dtrain = DMatrix::load("../../xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+    println!("Train matrix: {}x{}", dtrain.num_rows(), dtrain.num_cols());
+    let dtest = DMatrix::load("../../xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap();
+    println!("Test matrix: {}x{}", dtest.num_rows(), dtest.num_cols());
+
+    // configure objectives, metrics, etc.
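+    // (The builders below form a hierarchy: learning-task options such as the
+    // objective, booster-specific options such as tree depth, and finally the
+    // configuration for the overall training run.)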
+    let learning_params = parameters::learning::LearningTaskParametersBuilder::default()
+        .objective(parameters::learning::Objective::BinaryLogistic)
+        .build().unwrap();
+
+    // configure the tree-based learning model's parameters
+    let tree_params = parameters::tree::TreeBoosterParametersBuilder::default()
+        .max_depth(2)
+        .eta(1.0)
+        .build().unwrap();
+
+    // overall configuration for Booster
+    let booster_params = parameters::BoosterParametersBuilder::default()
+        .booster_type(parameters::BoosterType::Tree(tree_params))
+        .learning_params(learning_params)
+        .verbose(true)
+        .build().unwrap();
+
+    // specify datasets to evaluate against during training
+    let evaluation_sets = [(&dtest, "test"), (&dtrain, "train")];
+
+    // overall configuration for training/evaluation
+    let training_params = parameters::TrainingParametersBuilder::default()
+        .dtrain(&dtrain)                          // dataset to train with
+        .boost_rounds(2)                          // number of training iterations
+        .booster_params(booster_params)           // model parameters
+        .evaluation_sets(Some(&evaluation_sets))  // optional datasets to evaluate against in each iteration
+        .build().unwrap();
+
+    // train booster model, and print evaluation metrics
+    println!("\nTraining tree booster...");
+    let booster = Booster::train(&training_params).unwrap();
+
+    // get prediction probabilities for given matrix
+    let preds = booster.predict(&dtest).unwrap();
+
+    // get ground truth labels for each test example (i.e. 0 or 1)
+    println!("\nChecking predictions...");
+    let labels = dtest.get_labels().unwrap();
+    println!("First 3 labels: {} {} {}", labels[0], labels[1], labels[2]);
+
+    // print error rate (predictions above 0.5 are treated as positive)
+    let num_correct: usize = preds.iter()
+        .zip(labels.iter())
+        .map(|(&pred, &label)| if (pred > 0.5) == (label > 0.5) { 1 } else { 0 })
+        .sum();
+    println!("error={} ({}/{} correct)",
+             1.0 - num_correct as f32 / preds.len() as f32, num_correct, preds.len());
+
+    // save and load model file
+    println!("\nSaving and loading Booster model...");
+    booster.save("xgb.model").unwrap();
+    let booster = Booster::load("xgb.model").unwrap();
+    let preds2 = booster.predict(&dtest).unwrap();
+    assert_eq!(preds, preds2);
+
+    // save and load data matrix file
+    println!("\nSaving and loading matrix data...");
+    dtest.save("test.dmat").unwrap();
+    let dtest2 = DMatrix::load("test.dmat").unwrap();
+    assert_eq!(booster.predict(&dtest2).unwrap(), preds);
+
+    // error handling example
+    println!("\nError message example...");
+    let result = Booster::load("/does/not/exist");
+    match result {
+        Ok(_booster) => (),
+        Err(err) => println!("Got expected error: {}", err),
+    }
+
+    // sparse matrix usage
+    println!("\nSparse matrix construction...");
+
+    // f32 label for each row of data
+    let mut labels = Vec::new();
+
+    // construct sparse matrix in triplet format, then convert to CSR/CSC later
+    let mut rows = Vec::new();
+    let mut cols = Vec::new();
+    let mut data = Vec::new();
+
+    let reader = BufReader::new(File::open("../../xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap());
+    let mut current_row = 0;
+    for line in reader.lines() {
+        let line = line.unwrap();
+        let sample: Vec<&str> = line.split_whitespace().collect();
+        labels.push(sample[0].parse::<f32>().unwrap());
+
+        for entry in &sample[1..] {
+            let pair: Vec<&str> = entry.split(':').collect();
+            rows.push(current_row);
+            cols.push(pair[0].parse::<usize>().unwrap());
+            data.push(pair[1].parse::<f32>().unwrap());
+        }
+
+        current_row += 1;
+    }
+
+    // work out size of sparse matrix from max row/col values
+    let shape = ((*rows.iter().max().unwrap() + 1) as usize,
+                 (*cols.iter().max().unwrap() + 1) as usize);
+    let num_col = Some((*cols.iter().max().unwrap() + 1) as usize);
+    let triplet_mat = sprs::TriMatBase::from_triplets(shape, rows, cols, data);
+    let csr_mat = triplet_mat.to_csr();
+
+    let indices: Vec<usize> = csr_mat.indices().into_iter().map(|i| *i as usize).collect();
+    let mut dtrain = DMatrix::from_csr(csr_mat.indptr().raw_storage(), &indices, csr_mat.data(), num_col).unwrap();
+    dtrain.set_labels(&labels).unwrap();
+
+    let training_params = parameters::TrainingParametersBuilder::default().dtrain(&dtrain).build().unwrap();
+    let _ = Booster::train(&training_params).unwrap();
+}
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/custom_objective/Cargo.toml b/pgml-extension/pgml_rust/rust-xgboost/examples/custom_objective/Cargo.toml
new file mode 100644
index 000000000..415ad4a75
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/custom_objective/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "xgboost-custom-objective-example"
+version = "0.1.0"
+authors = ["Dave Challis "]
+publish = false
+
+[dependencies]
+xgboost = { path = "../../" }
+ndarray = "0.11"
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/custom_objective/src/main.rs b/pgml-extension/pgml_rust/rust-xgboost/examples/custom_objective/src/main.rs
new file mode 100644
index 000000000..707f037db
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/custom_objective/src/main.rs
@@ -0,0 +1,79 @@
+extern crate xgboost;
+extern crate ndarray;
+
+use xgboost::{parameters, DMatrix, Booster};
+
+fn main() {
+    // load train and test matrices from text files (in LibSVM format)
+    println!("Custom objective example...");
+    let dtrain = DMatrix::load("../../xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+    let dtest = DMatrix::load("../../xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap();
+
+    // specify datasets to evaluate against during training
+    let evaluation_sets = [(&dtest, "test"), (&dtrain, "train")];
+
+    // define custom objective function
+    fn log_reg_obj(preds: &[f32], dtrain: &DMatrix) -> (Vec<f32>, Vec<f32>) {
+        let mut preds = ndarray::Array1::from_vec(preds.to_vec());
+        preds.map_inplace(|x| *x = (-*x).exp());
+        preds = 1.0 / (1.0 + preds);
+
+        let labels = ndarray::Array1::from_vec(dtrain.get_labels().unwrap().to_vec());
+        let gradient = &preds - &labels;
+        let hessian = &preds * &(1.0 - &preds);
+
+        (gradient.to_vec(), hessian.to_vec())
+    }
+
+    // define custom evaluation function
+    fn eval_error(preds: &[f32], dtrain: &DMatrix) -> f32 {
+        let labels = dtrain.get_labels().unwrap();
+        let preds = ndarray::Array1::from_vec(preds.to_vec());
+        let mut num_incorrect = 0;
+        for (label, pred) in labels.iter().zip(preds.iter()) {
+            let pred = if *pred > 0.0 { 1.0 } else { 0.0 };
+            if pred != *label {
+                num_incorrect += 1;
+            }
+        }
+        num_incorrect as f32 / labels.len() as f32
+    }
+
+    let tree_params = parameters::tree::TreeBoosterParametersBuilder::default()
+        .max_depth(2)
+        .eta(1.0)
+        .build().unwrap();
+
+    // overall configuration for Booster
+    let booster_params = parameters::BoosterParametersBuilder::default()
+        .learning_params(parameters::learning::LearningTaskParameters::default())
+        .booster_type(parameters::BoosterType::Tree(tree_params))
+        .build().unwrap();
+
+    let training_params = parameters::TrainingParametersBuilder::default()
+        .dtrain(&dtrain)
+        .booster_params(booster_params)
+        .boost_rounds(2)
+        .evaluation_sets(Some(&evaluation_sets))
+        .custom_objective_fn(Some(log_reg_obj))
+        .custom_evaluation_fn(Some(eval_error))
+        .build().unwrap();
+
+    // train booster model, and print evaluation metrics
+    println!("\nTraining tree booster...");
+    let bst = Booster::train(&training_params).unwrap();
+
+    // get prediction probabilities for given matrix
+    let preds = bst.predict(&dtest).unwrap();
+
+    // get ground truth labels for each test example (i.e. 0 or 1)
+    println!("\nChecking predictions...");
+    let labels = dtest.get_labels().unwrap();
+    println!("First 3 labels: {} {} {}", labels[0], labels[1], labels[2]);
+
+    // print error rate (predictions above 0.5 are treated as positive)
+    let num_correct: usize = preds.iter()
+        .zip(labels.iter())
+        .map(|(&pred, &label)| if (pred > 0.5) == (label > 0.5) { 1 } else { 0 })
+        .sum();
+    println!("error={} ({}/{} correct)",
+             1.0 - num_correct as f32 / preds.len() as f32, num_correct, preds.len());
+}
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/generalised_linear_model/Cargo.toml b/pgml-extension/pgml_rust/rust-xgboost/examples/generalised_linear_model/Cargo.toml
new file mode 100644
index 000000000..cd75ddded
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/generalised_linear_model/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "xgboost-generalised-linear-model-example"
+version = "0.1.0"
+authors = ["Dave Challis "]
+publish = false
+
+[dependencies]
+xgboost = { path = "../../" }
+ndarray = "0.11"
+log = "0.4"
+env_logger = "0.5"
\ No newline at end of file
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/generalised_linear_model/src/main.rs b/pgml-extension/pgml_rust/rust-xgboost/examples/generalised_linear_model/src/main.rs
new file mode 100644
index 000000000..a34974c0e
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/generalised_linear_model/src/main.rs
@@ -0,0 +1,65 @@
+//! Example of how to fit a generalised linear model in XGBoost.
+
+extern crate xgboost;
+extern crate ndarray;
+extern crate env_logger;
+
+use xgboost::{parameters, DMatrix, Booster};
+
+fn main() {
+    // initialise logging, run with e.g. RUST_LOG=xgboost=debug to see more details
+    env_logger::init();
+
+    // load train and test matrices from text files (in LibSVM format)
+    println!("Generalised linear model example...");
+    let dtrain = DMatrix::load("../../xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+    let dtest = DMatrix::load("../../xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap();
+
+    // configure objectives, metrics, etc.
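+    // (A generalised linear model uses the Linear booster type rather than Tree;
+    // the alpha and lambda parameters below are its L1 and L2 regularisation weights.)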
+    let learning_params = parameters::learning::LearningTaskParametersBuilder::default()
+        .objective(parameters::learning::Objective::BinaryLogistic)
+        .build().unwrap();
+
+    // configure linear model parameters
+    let linear_params = parameters::linear::LinearBoosterParametersBuilder::default()
+        .alpha(0.0001)
+        .lambda(1.0)
+        .build().unwrap();
+
+    // overall configuration for Booster
+    let booster_params = parameters::BoosterParametersBuilder::default()
+        .learning_params(learning_params)
+        .booster_type(parameters::BoosterType::Linear(linear_params))
+        .build().unwrap();
+
+    // Specify datasets to evaluate against during training
+    let evaluation_sets = [(&dtest, "test"), (&dtrain, "train")];
+
+    let training_params = parameters::TrainingParametersBuilder::default()
+        .dtrain(&dtrain)
+        .boost_rounds(4)
+        .booster_params(booster_params)
+        .evaluation_sets(Some(&evaluation_sets))
+        .build().unwrap();
+
+    // Train booster model, and print evaluation metrics
+    println!("\nTraining linear booster...");
+    let bst = Booster::train(&training_params).unwrap();
+
+    // Get prediction probabilities for given matrix
+    let preds = bst.predict(&dtest).unwrap();
+
+    // Get ground truth labels for each test example (0.0 or 1.0 in this case)
+    let labels = dtest.get_labels().unwrap();
+
+    // Print error rate
+    let mut num_errors = 0;
+    for (pred, label) in preds.iter().zip(labels) {
+        let pred = if *pred > 0.5 { 1.0 } else { 0.0 };
+        if pred != *label {
+            num_errors += 1;
+        }
+    }
+    println!("error={} ({}/{} correct)",
+             num_errors as f32 / preds.len() as f32, preds.len() - num_errors, preds.len());
+}
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/multiclass_classification/Cargo.toml b/pgml-extension/pgml_rust/rust-xgboost/examples/multiclass_classification/Cargo.toml
new file mode 100644
index 000000000..63984374b
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/multiclass_classification/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "xgboost-multiclass-classification-example"
+version = "0.1.0"
+authors = ["Dave Challis "]
+publish = false
+
+[dependencies]
+xgboost = { path = "../../" }
+log = "0.4"
+env_logger = "0.5"
+reqwest = { version = "0.11", features = ["blocking"] }
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/multiclass_classification/src/main.rs b/pgml-extension/pgml_rust/rust-xgboost/examples/multiclass_classification/src/main.rs
new file mode 100644
index 000000000..7bfa93d63
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/multiclass_classification/src/main.rs
@@ -0,0 +1,136 @@
+extern crate xgboost;
+extern crate reqwest;
+extern crate env_logger;
+#[macro_use]
+extern crate log;
+
+use std::path::Path;
+use std::io::{BufRead, BufReader, BufWriter};
+use std::fs::File;
+use xgboost::{DMatrix, Booster};
+use xgboost::parameters::{self, tree, learning::Objective};
+
+
+
+fn main() {
+    // initialise logging, run with e.g. RUST_LOG=xgboost_multiclass_classification_example=debug
+    env_logger::init();
+
+    // download training data, if not already present locally
+    download_dataset("dermatology.data");
+
+    // load train and test matrices parsed from the downloaded CSV file.
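+    // (The helper below parses the comma-separated UCI file and takes the first
+    // 70% of rows as the training split, the remainder as the test split.)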
+    let (dtrain, dtest) = load_train_test_dmats("dermatology.data");
+
+    // evaluate against both datasets during training
+    let eval_sets = &[(&dtrain, "train"), (&dtest, "test")];
+
+    // configure learning objective to use multiclass softmax with 6 classes
+    let learning_params = parameters::learning::LearningTaskParametersBuilder::default()
+        .objective(Objective::MultiSoftmax(6))
+        .build().unwrap();
+
+    // configure tree gradient boosting parameters
+    let tree_params = tree::TreeBoosterParametersBuilder::default()
+        .eta(0.1)
+        .max_depth(6)
+        .build().unwrap();
+
+    // configure booster
+    let booster_params = parameters::BoosterParametersBuilder::default()
+        .booster_type(parameters::BoosterType::Tree(tree_params))
+        .learning_params(learning_params)
+        .threads(Some(4))
+        .build().unwrap();
+
+    // configure the training run
+    let training_params = parameters::TrainingParametersBuilder::default()
+        .dtrain(&dtrain)
+        .booster_params(booster_params)
+        .boost_rounds(5)
+        .evaluation_sets(Some(eval_sets))
+        .build().unwrap();
+
+    // train a new booster model with given parameters, printing results on evaluation sets
+    let booster = Booster::train(&training_params).unwrap();
+
+    let y_true = dtest.get_labels().unwrap();
+    let y_pred = booster.predict(&dtest).unwrap();
+    let num_errors: u32 = y_true.iter()
+        .zip(y_pred.iter())
+        .map(|(y1, y2)| if y1 != y2 { 1 } else { 0 })
+        .sum();
+    let error_rate = num_errors as f32 / y_true.len() as f32;
+    println!("Test error using softmax: {}", error_rate);
+}
+
+fn download_dataset<P: AsRef<Path>>(dst: P) {
+    let url = "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data";
+    let dst = dst.as_ref();
+    if dst.exists() {
+        debug!("Training dataset '{}' found", dst.display());
+        return;
+    }
+
+    debug!("Fetching training dataset from {}", url);
+    let mut response = reqwest::blocking::get(url).expect("failed to download training set data");
+
+    let file = File::create(dst).expect(&format!("failed to create file {}", dst.display()));
+    let mut writer = BufWriter::new(file);
+    response.copy_to(&mut writer).expect(&format!("failed to write to {}", dst.display()));
+}
+
+fn load_train_test_dmats<P: AsRef<Path>>(src: P) -> (DMatrix, DMatrix) {
+    let src = src.as_ref();
+    let file = File::open(src).expect(&format!("failed to open {}", src.display()));
+    let reader = BufReader::new(file);
+
+    let mut x: Vec<Vec<f32>> = Vec::new();
+    let mut y: Vec<f32> = Vec::new();
+    for line in reader.lines() {
+        let line = line.unwrap();
+        let cols: Vec<f32> = line.split(',')
+            .enumerate()
+            .map(|(col_num, value)| {
+                match col_num {
+                    // assign value to column which can contain missing data
+                    33 => if value == "?" { 1.0 } else { 0.0 },
+
+                    // convert class number from string -> zero based class ID float
+                    34 => value.parse::<f32>().unwrap() - 1.0,
+
+                    // convert column values from string -> float
+                    _ => value.parse::<f32>().unwrap()
+                }
+            })
+            .collect();
+
+        // skip column 33
+        x.push(cols[0..33].to_vec());
+
+        // final column contains class
+        y.push(cols[34]);
+    }
+
+    let num_rows = x.len();
+    let num_cols = x[0].len();
+
+    let train_size = (0.7 * num_rows as f32) as usize;
+    let test_size = num_rows - train_size;
+
+    debug!("Parsed {}x{} matrix from dataset", num_rows, num_cols);
+
+    // flatten into 1D vector
+    let x_train: Vec<f32> = x[0..train_size].into_iter()
+        .flat_map(|row| row.iter().cloned())
+        .collect();
+    let mut dtrain = DMatrix::from_dense(&x_train, train_size).unwrap();
+    dtrain.set_labels(&y[0..train_size]).unwrap();
+    let x_test: Vec<f32> = x[train_size..].into_iter()
+        .flat_map(|row| row.iter().cloned())
+        .collect();
+    let mut dtest = DMatrix::from_dense(&x_test, test_size).unwrap();
+    dtest.set_labels(&y[train_size..]).unwrap();
+
+    (dtrain, dtest)
+}
diff --git a/pgml-extension/pgml_rust/rust-xgboost/examples/runall.sh b/pgml-extension/pgml_rust/rust-xgboost/examples/runall.sh
new file mode 100755
index 000000000..732d52d80
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/examples/runall.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+examples=(basic custom_objective generalised_linear_model multiclass_classification)
+
+for example in "${examples[@]}"
+do
+    echo "---------- Running example: $example ---------"
+    (cd $example && cargo run)
+    echo
+done
diff --git a/pgml-extension/pgml_rust/rust-xgboost/src/booster.rs b/pgml-extension/pgml_rust/rust-xgboost/src/booster.rs
new file mode 100644
index 000000000..1f2dbac33
--- /dev/null
+++ b/pgml-extension/pgml_rust/rust-xgboost/src/booster.rs
@@ -0,0 +1,1045 @@
+use libc;
+use std::{fs::File, fmt, slice, ffi, ptr};
+use std::str::FromStr;
+use std::io::{self, Write, BufReader, BufRead};
+use std::collections::{BTreeMap, HashMap};
+use std::path::{Path, PathBuf};
+use error::XGBError;
+use dmatrix::DMatrix;
+use std::os::unix::ffi::OsStrExt;
+
+use xgboost_sys;
+use tempfile;
+use indexmap::IndexMap;
+
+use super::XGBResult;
+use parameters::{BoosterParameters, TrainingParameters};
+
+pub type CustomObjective = fn(&[f32], &DMatrix) -> (Vec<f32>, Vec<f32>);
+
+/// Used to control the return type of predictions made by C Booster API.
+enum PredictOption {
+    OutputMargin,
+    PredictLeaf,
+    PredictContribitions,
+    //ApproximateContributions,
+    PredictInteractions,
+}
+
+impl PredictOption {
+    /// Convert list of options into a bit mask.
+    fn options_as_mask(options: &[PredictOption]) -> i32 {
+        let mut option_mask = 0x00;
+        for option in options {
+            let value = match *option {
+                PredictOption::OutputMargin => 0x01,
+                PredictOption::PredictLeaf => 0x02,
+                PredictOption::PredictContribitions => 0x04,
+                //PredictOption::ApproximateContributions => 0x08,
+                PredictOption::PredictInteractions => 0x10,
+            };
+            option_mask |= value;
+        }
+
+        option_mask
+    }
+}
+
+/// Core model in XGBoost, containing functions for training, evaluating and predicting.
+///
+/// Usually created through the [`train`](struct.Booster.html#method.train) function, which
+/// creates and trains a Booster in a single call.
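+///
+/// Once trained, predictions can be made with [`predict`](struct.Booster.html#method.predict),
+/// and models persisted with [`save`](struct.Booster.html#method.save) and
+/// [`load`](struct.Booster.html#method.load).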
+///
+/// For more fine-grained usage, can be created using [`new`](struct.Booster.html#method.new) or
+/// [`new_with_cached_dmats`](struct.Booster.html#method.new_with_cached_dmats), then trained by calling
+/// [`update`](struct.Booster.html#method.update) or [`update_custom`](struct.Booster.html#method.update_custom)
+/// in a loop.
+pub struct Booster {
+    handle: xgboost_sys::BoosterHandle,
+}
+
+impl Booster {
+    /// Create a new Booster model with given parameters.
+    ///
+    /// This model can then be trained using calls to update/boost as appropriate.
+    ///
+    /// The [`train`](struct.Booster.html#method.train) function is often a more convenient way of constructing,
+    /// training and evaluating a Booster in a single call.
+    pub fn new(params: &BoosterParameters) -> XGBResult<Self> {
+        Self::new_with_cached_dmats(params, &[])
+    }
+
+    /// Create a new Booster model with given parameters and list of DMatrix to cache.
+    ///
+    /// Cached DMatrix can sometimes be used internally by XGBoost to speed up certain operations.
+    pub fn new_with_cached_dmats(params: &BoosterParameters, dmats: &[&DMatrix]) -> XGBResult<Self> {
+        let mut handle = ptr::null_mut();
+        // TODO: check this is safe if any dmats are freed
+        let s: Vec<xgboost_sys::DMatrixHandle> = dmats.iter().map(|x| x.handle).collect();
+        xgb_call!(xgboost_sys::XGBoosterCreate(s.as_ptr(), dmats.len() as u64, &mut handle))?;
+
+        let mut booster = Booster { handle };
+        booster.set_params(params)?;
+        Ok(booster)
+    }
+
+    /// Save this Booster as a binary file at given path.
+    pub fn save<P: AsRef<Path>>(&self, path: P) -> XGBResult<()> {
+        debug!("Writing Booster to: {}", path.as_ref().display());
+        let fname = ffi::CString::new(path.as_ref().as_os_str().as_bytes()).unwrap();
+        xgb_call!(xgboost_sys::XGBoosterSaveModel(self.handle, fname.as_ptr()))
+    }
+
+    /// Load a Booster from a binary file at given path.
+    pub fn load<P: AsRef<Path>>(path: P) -> XGBResult<Self> {
+        debug!("Loading Booster from: {}", path.as_ref().display());
+
+        // gives more control over error messages, avoids stack trace dump from C++
+        if !path.as_ref().exists() {
+            return Err(XGBError::new(format!("File not found: {}", path.as_ref().display())));
+        }
+
+        let fname = ffi::CString::new(path.as_ref().as_os_str().as_bytes()).unwrap();
+        let mut handle = ptr::null_mut();
+        xgb_call!(xgboost_sys::XGBoosterCreate(ptr::null(), 0, &mut handle))?;
+        xgb_call!(xgboost_sys::XGBoosterLoadModel(handle, fname.as_ptr()))?;
+        Ok(Booster { handle })
+    }
+
+    /// Load a Booster directly from a buffer.
+    pub fn load_buffer(bytes: &[u8]) -> XGBResult<Self> {
+        debug!("Loading Booster from buffer (length = {})", bytes.len());
+
+        let mut handle = ptr::null_mut();
+        xgb_call!(xgboost_sys::XGBoosterCreate(ptr::null(), 0, &mut handle))?;
+        xgb_call!(xgboost_sys::XGBoosterLoadModelFromBuffer(handle, bytes.as_ptr() as *const _, bytes.len() as u64))?;
+        Ok(Booster { handle })
+    }
+
+    /// Convenience function for creating/training a new Booster.
+    ///
+    /// This does the following:
+    ///
+    /// 1. create a new Booster model with given parameters
+    /// 2. train the model with given DMatrix
+    /// 3. print out evaluation results for each training round
+    /// 4. return trained Booster
+    ///
+    /// * `params` - training parameters
+    /// * `dtrain` - matrix to train Booster with
+    /// * `num_boost_round` - number of training iterations
+    /// * `eval_sets` - list of datasets to evaluate after each boosting round
+    pub fn train(params: &TrainingParameters) -> XGBResult<Self> {
+        let cached_dmats = {
+            let mut dmats = vec![params.dtrain];
+            if let Some(eval_sets) = params.evaluation_sets {
+                for (dmat, _) in eval_sets {
+                    dmats.push(*dmat);
+                }
+            }
+            dmats
+        };
+
+        let mut bst = Booster::new_with_cached_dmats(&params.booster_params, &cached_dmats)?;
+        //let num_parallel_tree = 1;
+
+        // load distributed code checkpoint from rabit
+        let version = bst.load_rabit_checkpoint()?;
+        debug!("Loaded Rabit checkpoint: version={}", version);
+        assert!(unsafe { xgboost_sys::RabitGetWorldSize() != 1 || version == 0 });
+
+        let _rank = unsafe { xgboost_sys::RabitGetRank() };
+        let start_iteration = version / 2;
+        //let mut nboost = start_iteration;
+
+        for i in start_iteration..params.boost_rounds as i32 {
+            // distributed code: need to resume to this point
+            // skip first update if a recovery step
+            if version % 2 == 0 {
+                if let Some(objective_fn) = params.custom_objective_fn {
+                    debug!("Boosting in round: {}", i);
+                    bst.update_custom(params.dtrain, objective_fn)?;
+                } else {
+                    debug!("Updating in round: {}", i);
+                    bst.update(params.dtrain, i)?;
+                }
+                bst.save_rabit_checkpoint()?;
+            }
+
+            assert!(unsafe { xgboost_sys::RabitGetWorldSize() == 1 || version == xgboost_sys::RabitVersionNumber() });
+
+            //nboost += 1;
+
+            if let Some(eval_sets) = params.evaluation_sets {
+                let mut dmat_eval_results = bst.eval_set(eval_sets, i)?;
+
+                if let Some(eval_fn) = params.custom_evaluation_fn {
+                    let eval_name = "custom";
+                    for (dmat, dmat_name) in eval_sets {
+                        let margin = bst.predict_margin(dmat)?;
+                        let eval_result = eval_fn(&margin, dmat);
+                        let eval_results = dmat_eval_results.entry(eval_name.to_string())
+                            .or_insert_with(IndexMap::new);
+                        eval_results.insert(dmat_name.to_string(), eval_result);
+                    }
+                }
+
+                // convert to map of eval_name -> (dmat_name -> score)
+                let mut eval_dmat_results = BTreeMap::new();
+                for (dmat_name, eval_results) in &dmat_eval_results {
+                    for (eval_name, result) in eval_results {
+                        let dmat_results = eval_dmat_results.entry(eval_name).or_insert_with(BTreeMap::new);
+                        dmat_results.insert(dmat_name, result);
+                    }
+                }
+
+                print!("[{}]", i);
+                for (eval_name, dmat_results) in eval_dmat_results {
+                    for (dmat_name, result) in dmat_results {
+                        print!("\t{}-{}:{}", dmat_name, eval_name, result);
+                    }
+                }
+                println!();
+            }
+        }
+
+        Ok(bst)
+    }
+
+    /// Update this Booster's parameters.
+    pub fn set_params(&mut self, p: &BoosterParameters) -> XGBResult<()> {
+        for (key, value) in p.as_string_pairs() {
+            debug!("Setting parameter: {}={}", &key, &value);
+            self.set_param(&key, &value)?;
+        }
+        Ok(())
+    }
+
+    /// Update this model by training it for one round with given training matrix.
+    ///
+    /// Uses XGBoost's objective function that was specified in this Booster's learning objective parameters.
+    ///
+    /// * `dtrain` - matrix to train the model with for a single iteration
+    /// * `iteration` - current iteration number
+    pub fn update(&mut self, dtrain: &DMatrix, iteration: i32) -> XGBResult<()> {
+        xgb_call!(xgboost_sys::XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
+    }
+
+    /// Update this model by training it for one round with a custom objective function.
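+    ///
+    /// The objective function is given the current predictions and the training
+    /// matrix, and must return the first and second order gradients (one value of
+    /// each per training example), which are then passed on to `boost`.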
+    pub fn update_custom(&mut self, dtrain: &DMatrix, objective_fn: CustomObjective) -> XGBResult<()> {
+        let pred = self.predict(dtrain)?;
+        let (gradient, hessian) = objective_fn(&pred.to_vec(), dtrain);
+        self.boost(dtrain, &gradient, &hessian)
+    }
+
+    /// Update this model by directly specifying the first and second order gradients.
+    ///
+    /// This is typically used instead of `update` when using a customised loss function.
+    ///
+    /// * `dtrain` - matrix to train the model with for a single iteration
+    /// * `gradient` - first order gradient
+    /// * `hessian` - second order gradient
+    fn boost(&mut self, dtrain: &DMatrix, gradient: &[f32], hessian: &[f32]) -> XGBResult<()> {
+        if gradient.len() != hessian.len() {
+            let msg = format!("Mismatch between length of gradient and hessian arrays ({} != {})",
+                              gradient.len(), hessian.len());
+            return Err(XGBError::new(msg));
+        }
+        assert_eq!(gradient.len(), hessian.len());
+
+        // TODO: _validate_feature_names
+        let mut grad_vec = gradient.to_vec();
+        let mut hess_vec = hessian.to_vec();
+        xgb_call!(xgboost_sys::XGBoosterBoostOneIter(self.handle,
+                                                     dtrain.handle,
+                                                     grad_vec.as_mut_ptr(),
+                                                     hess_vec.as_mut_ptr(),
+                                                     grad_vec.len() as u64))
+    }
+
+    fn eval_set(&self, evals: &[(&DMatrix, &str)], iteration: i32) -> XGBResult<IndexMap<String, IndexMap<String, f32>>> {
+        let (dmats, names) = {
+            let mut dmats = Vec::with_capacity(evals.len());
+            let mut names = Vec::with_capacity(evals.len());
+            for (dmat, name) in evals {
+                dmats.push(dmat);
+                names.push(*name);
+            }
+            (dmats, names)
+        };
+        assert_eq!(dmats.len(), names.len());
+
+        let mut s: Vec<xgboost_sys::DMatrixHandle> = dmats.iter().map(|x| x.handle).collect();
+
+        // build separate arrays of C strings and pointers to them to ensure they live long enough
+        let mut evnames: Vec<ffi::CString> = Vec::with_capacity(names.len());
+        let mut evptrs: Vec<*const libc::c_char> = Vec::with_capacity(names.len());
+
+        for name in &names {
+            let cstr = ffi::CString::new(*name).unwrap();
+            evptrs.push(cstr.as_ptr());
+            evnames.push(cstr);
+        }
+
+        // shouldn't be necessary, but guards against incorrect array sizing
+        evptrs.shrink_to_fit();
+
+        let mut out_result = ptr::null();
+        xgb_call!(xgboost_sys::XGBoosterEvalOneIter(self.handle,
+                                                    iteration,
+                                                    s.as_mut_ptr(),
+                                                    evptrs.as_mut_ptr(),
+                                                    dmats.len() as u64,
+                                                    &mut out_result))?;
+        let out = unsafe { ffi::CStr::from_ptr(out_result).to_str().unwrap().to_owned() };
+        Ok(Booster::parse_eval_string(&out, &names))
+    }
+
+    /// Evaluate given matrix against this model using metrics defined in this model's parameters.
+    ///
+    /// See parameter::learning::EvaluationMetric for a full list.
+    ///
+    /// Returns a map of evaluation metric name to score.
+    pub fn evaluate(&self, dmat: &DMatrix) -> XGBResult<HashMap<String, f32>> {
+        let name = "default";
+        let mut eval = self.eval_set(&[(dmat, name)], 0)?;
+        let mut result = HashMap::new();
+        eval.remove(name).unwrap()
+            .into_iter()
+            .for_each(|(k, v)| {
+                result.insert(k.to_owned(), v);
+            });
+
+        Ok(result)
+    }
+
+    /// Get a string attribute that was previously set for this model.
+    pub fn get_attribute(&self, key: &str) -> XGBResult<Option<String>> {
+        let key = ffi::CString::new(key).unwrap();
+        let mut out_buf = ptr::null();
+        let mut success = 0;
+        xgb_call!(xgboost_sys::XGBoosterGetAttr(self.handle, key.as_ptr(), &mut out_buf, &mut success))?;
+        if success == 0 {
+            return Ok(None);
+        }
+        assert!(success == 1);
+
+        let c_str: &ffi::CStr = unsafe { ffi::CStr::from_ptr(out_buf) };
+        let out = c_str.to_str().unwrap();
+        Ok(Some(out.to_owned()))
+    }
+
+    /// Store a string attribute in this model with given key.
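+    ///
+    /// Attributes are serialised along with the model, so they survive a
+    /// save/load round trip (see the `save_and_load_from_buffer` test below).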
+    pub fn set_attribute(&mut self, key: &str, value: &str) -> XGBResult<()> {
+        let key = ffi::CString::new(key).unwrap();
+        let value = ffi::CString::new(value).unwrap();
+        xgb_call!(xgboost_sys::XGBoosterSetAttr(self.handle, key.as_ptr(), value.as_ptr()))
+    }
+
+    /// Get names of all attributes stored in this model. Values can then be fetched with calls to `get_attribute`.
+    pub fn get_attribute_names(&self) -> XGBResult<Vec<String>> {
+        let mut out_len = 0;
+        let mut out = ptr::null_mut();
+        xgb_call!(xgboost_sys::XGBoosterGetAttrNames(self.handle, &mut out_len, &mut out))?;
+
+        let out_ptr_slice = unsafe { slice::from_raw_parts(out, out_len as usize) };
+        let out_vec = out_ptr_slice.iter()
+            .map(|str_ptr| unsafe { ffi::CStr::from_ptr(*str_ptr).to_str().unwrap().to_owned() })
+            .collect();
+        Ok(out_vec)
+    }
+
+    /// Predict results for given data.
+    ///
+    /// Returns an array containing one entry per row in the given data.
+    pub fn predict(&self, dmat: &DMatrix) -> XGBResult<Vec<f32>> {
+        let option_mask = PredictOption::options_as_mask(&[]);
+        let ntree_limit = 0;
+        let mut out_len = 0;
+        let mut out_result = ptr::null();
+        xgb_call!(xgboost_sys::XGBoosterPredict(self.handle,
+                                                dmat.handle,
+                                                option_mask,
+                                                ntree_limit,
+                                                0,
+                                                &mut out_len,
+                                                &mut out_result))?;
+
+        assert!(!out_result.is_null());
+        let data = unsafe { slice::from_raw_parts(out_result, out_len as usize).to_vec() };
+        Ok(data)
+    }
+
+    /// Predict margin for given data.
+    ///
+    /// Returns an array containing one entry per row in the given data.
+    pub fn predict_margin(&self, dmat: &DMatrix) -> XGBResult<Vec<f32>> {
+        let option_mask = PredictOption::options_as_mask(&[PredictOption::OutputMargin]);
+        let ntree_limit = 0;
+        let mut out_len = 0;
+        let mut out_result = ptr::null();
+        xgb_call!(xgboost_sys::XGBoosterPredict(self.handle,
+                                                dmat.handle,
+                                                option_mask,
+                                                ntree_limit,
+                                                1,
+                                                &mut out_len,
+                                                &mut out_result))?;
+        assert!(!out_result.is_null());
+        let data = unsafe { slice::from_raw_parts(out_result, out_len as usize).to_vec() };
+        Ok(data)
+    }
+
+    /// Get predicted leaf index for each sample in given data.
+    ///
+    /// Returns an array of shape (number of samples, number of trees) as tuple of (data, num_rows).
+    ///
+    /// Note: the leaf index of a tree is unique per tree, so e.g. leaf 1 could be found in both tree 1 and tree 0.
+    pub fn predict_leaf(&self, dmat: &DMatrix) -> XGBResult<(Vec<f32>, (usize, usize))> {
+        let option_mask = PredictOption::options_as_mask(&[PredictOption::PredictLeaf]);
+        let ntree_limit = 0;
+        let mut out_len = 0;
+        let mut out_result = ptr::null();
+        xgb_call!(xgboost_sys::XGBoosterPredict(self.handle,
+                                                dmat.handle,
+                                                option_mask,
+                                                ntree_limit,
+                                                0,
+                                                &mut out_len,
+                                                &mut out_result))?;
+        assert!(!out_result.is_null());
+
+        let data = unsafe { slice::from_raw_parts(out_result, out_len as usize).to_vec() };
+        let num_rows = dmat.num_rows();
+        let num_cols = data.len() / num_rows;
+        Ok((data, (num_rows, num_cols)))
+    }
+
+    /// Get feature contributions (SHAP values) for each prediction.
+    ///
+    /// The sum of all feature contributions is equal to the raw untransformed margin value of the
+    /// prediction.
+    ///
+    /// Returns an array of shape (number of samples, number of features + 1) as a tuple of
+    /// (data, num_rows). The final column contains the bias term.
+    pub fn predict_contributions(&self, dmat: &DMatrix) -> XGBResult<(Vec<f32>, (usize, usize))> {
+        let option_mask = PredictOption::options_as_mask(&[PredictOption::PredictContribitions]);
+        let ntree_limit = 0;
+        let mut out_len = 0;
+        let mut out_result = ptr::null();
+        xgb_call!(xgboost_sys::XGBoosterPredict(self.handle,
+                                                dmat.handle,
+                                                option_mask,
+                                                ntree_limit,
+                                                0,
+                                                &mut out_len,
+                                                &mut out_result))?;
+        assert!(!out_result.is_null());
+
+        let data = unsafe { slice::from_raw_parts(out_result, out_len as usize).to_vec() };
+        let num_rows = dmat.num_rows();
+        let num_cols = data.len() / num_rows;
+        Ok((data, (num_rows, num_cols)))
+    }
+
+    /// Get SHAP interaction values for each pair of features for each prediction.
+    ///
+    /// The sum of each row (or column) of the interaction values equals the corresponding SHAP
+    /// value (from `predict_contributions`), and the sum of the entire matrix equals the raw
+    /// untransformed margin value of the prediction.
+    ///
+    /// Returns an array of shape (number of samples, number of features + 1, number of features + 1).
+    /// The final row and column contain the bias terms.
+    pub fn predict_interactions(&self, dmat: &DMatrix) -> XGBResult<(Vec<f32>, (usize, usize, usize))> {
+        let option_mask = PredictOption::options_as_mask(&[PredictOption::PredictInteractions]);
+        let ntree_limit = 0;
+        let mut out_len = 0;
+        let mut out_result = ptr::null();
+        xgb_call!(xgboost_sys::XGBoosterPredict(self.handle,
+                                                dmat.handle,
+                                                option_mask,
+                                                ntree_limit,
+                                                0,
+                                                &mut out_len,
+                                                &mut out_result))?;
+        assert!(!out_result.is_null());
+
+        let data = unsafe { slice::from_raw_parts(out_result, out_len as usize).to_vec() };
+        let num_rows = dmat.num_rows();
+
+        let dim = ((data.len() / num_rows) as f64).sqrt() as usize;
+        Ok((data, (num_rows, dim, dim)))
+    }
+
+    /// Get a dump of this model as a string.
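+    ///
+    /// The dump lists each tree's splits and leaf values in a human-readable form
+    /// (see the `dump_model` test at the bottom of this file for sample output).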
+    ///
+    /// * `with_statistics` - whether to include statistics in output dump
+    /// * `feature_map` - if given, map feature IDs to feature names from given map
+    pub fn dump_model(&self, with_statistics: bool, feature_map: Option<&FeatureMap>) -> XGBResult<String> {
+        if let Some(fmap) = feature_map {
+            let tmp_dir = match tempfile::tempdir() {
+                Ok(dir) => dir,
+                Err(err) => return Err(XGBError::new(err.to_string())),
+            };
+
+            let file_path = tmp_dir.path().join("fmap.txt");
+            let mut file: File = match File::create(&file_path) {
+                Ok(f) => f,
+                Err(err) => return Err(XGBError::new(err.to_string())),
+            };
+
+            for (feature_num, (feature_name, feature_type)) in &fmap.0 {
+                writeln!(file, "{}\t{}\t{}", feature_num, feature_name, feature_type).unwrap();
+            }
+
+            self.dump_model_fmap(with_statistics, Some(&file_path))
+        } else {
+            self.dump_model_fmap(with_statistics, None)
+        }
+    }
+
+    fn dump_model_fmap(&self, with_statistics: bool, feature_map_path: Option<&PathBuf>) -> XGBResult<String> {
+        let fmap = if let Some(path) = feature_map_path {
+            ffi::CString::new(path.as_os_str().as_bytes()).unwrap()
+        } else {
+            ffi::CString::new("").unwrap()
+        };
+        let format = ffi::CString::new("text").unwrap();
+        let mut out_len = 0;
+        let mut out_dump_array = ptr::null_mut();
+        xgb_call!(xgboost_sys::XGBoosterDumpModelEx(self.handle,
+                                                    fmap.as_ptr(),
+                                                    with_statistics as i32,
+                                                    format.as_ptr(),
+                                                    &mut out_len,
+                                                    &mut out_dump_array))?;
+
+        let out_ptr_slice = unsafe { slice::from_raw_parts(out_dump_array, out_len as usize) };
+        let out_vec: Vec<String> = out_ptr_slice.iter()
+            .map(|str_ptr| unsafe { ffi::CStr::from_ptr(*str_ptr).to_str().unwrap().to_owned() })
+            .collect();
+
+        assert_eq!(out_len as usize, out_vec.len());
+        Ok(out_vec.join("\n"))
+    }
+
+    pub(crate) fn load_rabit_checkpoint(&self) -> XGBResult<i32> {
+        let mut version = 0;
+        xgb_call!(xgboost_sys::XGBoosterLoadRabitCheckpoint(self.handle, &mut version))?;
+        Ok(version)
+    }
+
+    pub(crate) fn save_rabit_checkpoint(&self) -> XGBResult<()> {
+        xgb_call!(xgboost_sys::XGBoosterSaveRabitCheckpoint(self.handle))
+    }
+
+    fn set_param(&mut self, name: &str, value: &str) -> XGBResult<()> {
+        let name = ffi::CString::new(name).unwrap();
+        let value = ffi::CString::new(value).unwrap();
+        xgb_call!(xgboost_sys::XGBoosterSetParam(self.handle, name.as_ptr(), value.as_ptr()))
+    }
+
+    fn parse_eval_string(eval: &str, evnames: &[&str]) -> IndexMap<String, IndexMap<String, f32>> {
+        let mut result: IndexMap<String, IndexMap<String, f32>> = IndexMap::new();
+
+        debug!("Parsing evaluation line: {}", &eval);
+        for part in eval.split('\t').skip(1) {
+            for evname in evnames {
+                if part.starts_with(evname) {
+                    let metric_parts: Vec<&str> = part[evname.len()+1..].split(':').into_iter().collect();
+                    assert_eq!(metric_parts.len(), 2);
+                    let metric = metric_parts[0];
+                    let score = metric_parts[1].parse::<f32>()
+                        .unwrap_or_else(|_| panic!("Unable to parse XGBoost metrics output: {}", eval));
+
+                    let metric_map = result.entry(evname.to_string()).or_insert_with(IndexMap::new);
+                    metric_map.insert(metric.to_owned(), score);
+                }
+            }
+        }
+
+        debug!("result: {:?}", &result);
+        result
+    }
+
+}
+
+impl Drop for Booster {
+    fn drop(&mut self) {
+        xgb_call!(xgboost_sys::XGBoosterFree(self.handle)).unwrap();
+    }
+}
+
+/// Maps a feature index to a name and type, used when dumping models as text.
+///
+/// See [dump_model](struct.Booster.html#method.dump_model) for usage.
+pub struct FeatureMap(BTreeMap<u32, (String, FeatureType)>);
+
+impl FeatureMap {
+    /// Read a `FeatureMap` from a file at given path.
+    ///
+    /// File should contain one feature definition per line, and be of the form:
+    /// ```text
+    /// <feature_num>\t<feature_name>\t<feature_type>\n
+    /// ```
+    ///
+    /// Type should be one of:
+    /// * `i` - binary feature
+    /// * `q` - quantitative feature
+    /// * `int` - integer features
+    ///
+    /// E.g.:
+    /// ```text
+    /// 0	age	int
+    /// 1	is-parent?=yes	i
+    /// 2	is-parent?=no	i
+    /// 3	income	int
+    /// ```
+    pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<FeatureMap> {
+        let file = File::open(path)?;
+        let mut features: FeatureMap = FeatureMap(BTreeMap::new());
+
+        for (i, line) in BufReader::new(&file).lines().enumerate() {
+            let line = line?;
+            let parts: Vec<&str> = line.split('\t').collect();
+            if parts.len() != 3 {
+                let msg = format!("Unable to parse features from line {}, expected 3 tab separated values", i+1);
+                return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+            }
+
+            assert_eq!(parts.len(), 3);
+            let feature_num: u32 = match parts[0].parse() {
+                Ok(num) => num,
+                Err(err) => {
+                    let msg = format!("Unable to parse features from line {}, could not parse feature number: {}",
+                                      i+1, err);
+                    return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+                }
+            };
+
+            let feature_name = &parts[1];
+            let feature_type = match FeatureType::from_str(&parts[2]) {
+                Ok(feature_type) => feature_type,
+                Err(msg) => {
+                    let msg = format!("Unable to parse features from line {}: {}", i+1, msg);
+                    return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+                }
+            };
+            features.0.insert(feature_num, (feature_name.to_string(), feature_type));
+        }
+        Ok(features)
+    }
+}
+
+/// Indicates the type of a feature, used when dumping models as text.
+pub enum FeatureType {
+    /// Binary indicator feature.
+    Binary,
+
+    /// Quantitative feature (e.g. age, time, etc.), can be missing.
+    Quantitative,
+
+    /// Integer feature (when hinted, decision boundary will be integer).
+    Integer,
+}
+
+impl FromStr for FeatureType {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "i" => Ok(FeatureType::Binary),
+            "q" => Ok(FeatureType::Quantitative),
+            "int" => Ok(FeatureType::Integer),
+            _ => Err(format!("unrecognised feature type '{}', must be one of: 'i', 'q', 'int'", s))
+        }
+    }
+}
+
+impl fmt::Display for FeatureType {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let s = match self {
+            FeatureType::Binary => "i",
+            FeatureType::Quantitative => "q",
+            FeatureType::Integer => "int",
+        };
+        write!(f, "{}", s)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use parameters::{self, learning, tree};
+
+    fn read_train_matrix() -> XGBResult<DMatrix> {
+        DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train")
+    }
+
+    fn load_test_booster() -> Booster {
+        let dmat = read_train_matrix().expect("Reading train matrix failed");
+        Booster::new_with_cached_dmats(&BoosterParameters::default(), &[&dmat]).expect("Creating Booster failed")
+    }
+
+    #[test]
+    fn set_booster_param() {
+        let mut booster = load_test_booster();
+        let res = booster.set_param("key", "value");
+        assert!(res.is_ok());
+    }
+
+    #[test]
+    fn load_rabit_version() {
+        let version = load_test_booster().load_rabit_checkpoint().unwrap();
+        assert_eq!(version, 0);
+    }
+
+    #[test]
+    fn get_set_attr() {
+        let mut booster = load_test_booster();
+        let attr = booster.get_attribute("foo").expect("Getting attribute failed");
+        assert_eq!(attr, None);
+
+        booster.set_attribute("foo", "bar").expect("Setting attribute failed");
+        let attr = booster.get_attribute("foo").expect("Getting attribute failed");
+        assert_eq!(attr, Some("bar".to_owned()));
+    }
+
+    #[test]
+    fn save_and_load_from_buffer() {
+        let dmat_train = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+        let mut booster = Booster::new_with_cached_dmats(&BoosterParameters::default(), &[&dmat_train]).unwrap();
+        let attr = booster.get_attribute("foo").expect("Getting attribute failed");
+        assert_eq!(attr, None);
+
+        booster.set_attribute("foo", "bar").expect("Setting attribute failed");
+        let attr = booster.get_attribute("foo").expect("Getting attribute failed");
+        assert_eq!(attr, Some("bar".to_owned()));
+
+        let dir = tempfile::tempdir().expect("create temp dir");
+        let path = dir.path().join("test-xgboost-model");
+        booster.save(&path).expect("saving booster");
+        drop(booster);
+        let bytes = std::fs::read(&path).expect("read saved booster file");
+        let booster = Booster::load_buffer(&bytes[..]).expect("load booster from buffer");
+        let attr = booster.get_attribute("foo").expect("Getting attribute failed");
+        assert_eq!(attr, Some("bar".to_owned()));
+    }
+
+    #[test]
+    fn get_attribute_names() {
+        let mut booster = load_test_booster();
+        let attrs = booster.get_attribute_names().expect("Getting attributes failed");
+        assert_eq!(attrs, Vec::<String>::new());
+
+        booster.set_attribute("foo", "bar").expect("Setting attribute failed");
+        booster.set_attribute("another", "another").expect("Setting attribute failed");
+        booster.set_attribute("4", "4").expect("Setting attribute failed");
+        booster.set_attribute("an even longer attribute name?", "").expect("Setting attribute failed");
+
+        let mut expected = vec!["foo", "another", "4", "an even longer attribute name?"];
+        expected.sort();
+        let mut attrs = booster.get_attribute_names().expect("Getting attributes failed");
+        attrs.sort();
+        assert_eq!(attrs, expected);
+    }
+
+    #[test]
+    fn predict() {
+        let dmat_train = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+        let dmat_test = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap();
+
+        let tree_params = tree::TreeBoosterParametersBuilder::default()
+            .max_depth(2)
+            .eta(1.0)
+            .build()
+            .unwrap();
+        let learning_params = learning::LearningTaskParametersBuilder::default()
+            .objective(learning::Objective::BinaryLogistic)
+            .eval_metrics(learning::Metrics::Custom(vec![learning::EvaluationMetric::MAPCutNegative(4),
+                                                         learning::EvaluationMetric::LogLoss,
+                                                         learning::EvaluationMetric::BinaryErrorRate(0.5)]))
+            .build()
+            .unwrap();
+        let params = parameters::BoosterParametersBuilder::default()
+            .booster_type(parameters::BoosterType::Tree(tree_params))
+            .learning_params(learning_params)
+            .verbose(false)
+            .build()
+            .unwrap();
+        let mut booster = Booster::new_with_cached_dmats(&params, &[&dmat_train, &dmat_test]).unwrap();
+
+        for i in 0..10 {
+            booster.update(&dmat_train, i).expect("update failed");
+        }
+
+        let train_metrics = booster.evaluate(&dmat_train).unwrap();
+        assert_eq!(*train_metrics.get("logloss").unwrap(), 0.006634);
+        assert_eq!(*train_metrics.get("map@4-").unwrap(), 0.001274);
+
+        let test_metrics = booster.evaluate(&dmat_test).unwrap();
+        assert_eq!(*test_metrics.get("logloss").unwrap(), 0.00692);
+        assert_eq!(*test_metrics.get("map@4-").unwrap(), 0.005155);
+
+        let v = booster.predict(&dmat_test).unwrap();
+        assert_eq!(v.len(), dmat_test.num_rows());
+
+        // first 10 predictions
+        let expected_start = [0.0050151693,
+                              0.9884467,
+                              0.0050151693,
+                              0.0050151693,
+                              0.026636455,
+                              0.11789363,
+                              0.9884467,
+                              0.01231471,
+                              0.9884467,
+                              0.00013656063];
+
+        // last 10 predictions
+        let expected_end = [0.002520344,
+                            0.00060917926,
+                            0.99881005,
+                            0.00060917926,
+                            0.00060917926,
+                            0.00060917926,
+                            0.00060917926,
+                            0.9981102,
+                            0.002855195,
+                            0.9981102];
+        let eps = 1e-6;
+
+        for (pred, expected) in v.iter().zip(&expected_start) {
+            println!("predictions={}, expected={}", pred, expected);
+            assert!(pred - expected < eps);
+        }
+
+        for (pred, expected) in v[v.len()-10..].iter().zip(&expected_end) {
+            println!("predictions={}, expected={}", pred, expected);
+            assert!(pred - expected < eps);
+        }
+    }
+
+    #[test]
+    fn predict_leaf() {
+        let dmat_train = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+        let dmat_test = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap();
+
+        let tree_params = tree::TreeBoosterParametersBuilder::default()
+            .max_depth(2)
+            .eta(1.0)
+            .build()
+            .unwrap();
+        let learning_params = learning::LearningTaskParametersBuilder::default()
+            .objective(learning::Objective::BinaryLogistic)
+            .eval_metrics(learning::Metrics::Custom(vec![learning::EvaluationMetric::LogLoss]))
+            .build()
+            .unwrap();
+        let params = parameters::BoosterParametersBuilder::default()
+            .booster_type(parameters::BoosterType::Tree(tree_params))
+            .learning_params(learning_params)
+            .verbose(false)
+            .build()
+            .unwrap();
+        let mut booster = Booster::new_with_cached_dmats(&params, &[&dmat_train, &dmat_test]).unwrap();
+
+        let num_rounds = 15;
+        for i in 0..num_rounds {
+            booster.update(&dmat_train, i).expect("update failed");
+        }
+
+        let (_preds, shape) = booster.predict_leaf(&dmat_test).unwrap();
+        let num_samples = dmat_test.num_rows();
+        assert_eq!(shape, (num_samples, num_rounds as usize));
+    }
+
+    #[test]
+    fn predict_contributions() {
+        let dmat_train = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap();
+        let dmat_test = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap();
DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap(); + + let tree_params = tree::TreeBoosterParametersBuilder::default() + .max_depth(2) + .eta(1.0) + .build() + .unwrap(); + let learning_params = learning::LearningTaskParametersBuilder::default() + .objective(learning::Objective::BinaryLogistic) + .eval_metrics(learning::Metrics::Custom(vec![learning::EvaluationMetric::LogLoss])) + .build() + .unwrap(); + let params = parameters::BoosterParametersBuilder::default() + .booster_type(parameters::BoosterType::Tree(tree_params)) + .learning_params(learning_params) + .verbose(false) + .build() + .unwrap(); + let mut booster = Booster::new_with_cached_dmats(¶ms, &[&dmat_train, &dmat_test]).unwrap(); + + let num_rounds = 5; + for i in 0..num_rounds { + booster.update(&dmat_train, i).expect("update failed"); + } + + let (_preds, shape) = booster.predict_contributions(&dmat_test).unwrap(); + let num_samples = dmat_test.num_rows(); + let num_features = dmat_train.num_cols(); + assert_eq!(shape, (num_samples, num_features + 1)); + } + + #[test] + fn predict_interactions() { + let dmat_train = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap(); + let dmat_test = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.test").unwrap(); + + let tree_params = tree::TreeBoosterParametersBuilder::default() + .max_depth(2) + .eta(1.0) + .build() + .unwrap(); + let learning_params = learning::LearningTaskParametersBuilder::default() + .objective(learning::Objective::BinaryLogistic) + .eval_metrics(learning::Metrics::Custom(vec![learning::EvaluationMetric::LogLoss])) + .build() + .unwrap(); + let params = parameters::BoosterParametersBuilder::default() + .booster_type(parameters::BoosterType::Tree(tree_params)) + .learning_params(learning_params) + .verbose(false) + .build() + .unwrap(); + let mut booster = Booster::new_with_cached_dmats(¶ms, &[&dmat_train, &dmat_test]).unwrap(); + + let num_rounds = 5; + for i in 0..num_rounds { + booster.update(&dmat_train, i).expect("update failed"); + } + + let (_preds, shape) = booster.predict_interactions(&dmat_test).unwrap(); + let num_samples = dmat_test.num_rows(); + let num_features = dmat_train.num_cols(); + assert_eq!(shape, (num_samples, num_features + 1, num_features + 1)); + } + + #[test] + fn parse_eval_string() { + let s = "[0]\ttrain-map@4-:0.5\ttrain-logloss:1.0\ttest-map@4-:0.25\ttest-logloss:0.75"; + let mut metrics = IndexMap::new(); + + let mut train_metrics = IndexMap::new(); + train_metrics.insert("map@4-".to_owned(), 0.5); + train_metrics.insert("logloss".to_owned(), 1.0); + + let mut test_metrics = IndexMap::new(); + test_metrics.insert("map@4-".to_owned(), 0.25); + test_metrics.insert("logloss".to_owned(), 0.75); + + metrics.insert("train".to_owned(), train_metrics); + metrics.insert("test".to_owned(), test_metrics); + assert_eq!(Booster::parse_eval_string(s, &["train", "test"]), metrics); + } + + #[test] + fn dump_model() { + let dmat_train = DMatrix::load("xgboost-sys/xgboost/demo/data/agaricus.txt.train").unwrap(); + + println!("{:?}", dmat_train.shape()); + + let tree_params = tree::TreeBoosterParametersBuilder::default() + .max_depth(2) + .eta(1.0) + .build().unwrap(); + let learning_params = learning::LearningTaskParametersBuilder::default() + .objective(learning::Objective::BinaryLogistic) + .build().unwrap(); + let booster_params = parameters::BoosterParametersBuilder::default() + .booster_type(parameters::BoosterType::Tree(tree_params)) + .learning_params(learning_params) + .verbose(false) + 
.build().unwrap(); + + let training_params = parameters::TrainingParametersBuilder::default() + .booster_params(booster_params) + .dtrain(&dmat_train) + .boost_rounds(10) + .build().unwrap(); + let booster = Booster::train(&training_params).unwrap(); + + let features = FeatureMap::from_file("xgboost-sys/xgboost/demo/data/featmap.txt") + .expect("failed to parse feature map file"); + + assert_eq!(booster.dump_model(true, Some(&features)).unwrap(), +"0:[odor=none] yes=2,no=1,gain=4000.53101,cover=1628.25 +1:[stalk-root=club] yes=4,no=3,gain=1158.21204,cover=924.5 + 3:leaf=1.71217716,cover=812 + 4:leaf=-1.70044053,cover=112.5 +2:[spore-print-color=green] yes=6,no=5,gain=198.173828,cover=703.75 + 5:leaf=-1.94070864,cover=690.5 + 6:leaf=1.85964918,cover=13.25 + +0:[stalk-root=rooted] yes=2,no=1,gain=832.545044,cover=788.852051 +1:[odor=none] yes=4,no=3,gain=569.725098,cover=768.389709 + 3:leaf=0.78471756,cover=458.936859 + 4:leaf=-0.968530357,cover=309.45282 + 2:leaf=-6.23624468,cover=20.462389 + +0:[ring-type=pendant] yes=2,no=1,gain=368.744568,cover=457.069458 +1:[stalk-surface-below-ring=scaly] yes=4,no=3,gain=226.33696,cover=221.051468 + 3:leaf=0.658725023,cover=212.999451 + 4:leaf=5.77228642,cover=8.05200672 +2:[spore-print-color=purple] yes=6,no=5,gain=258.184265,cover=236.018005 + 5:leaf=-0.791407049,cover=233.487625 + 6:leaf=-9.421422,cover=2.53038669 + +0:[odor=foul] yes=2,no=1,gain=140.486069,cover=364.119354 +1:[gill-size=broad] yes=4,no=3,gain=139.860504,cover=274.101959 + 3:leaf=0.614153326,cover=95.8599854 + 4:leaf=-0.877905607,cover=178.241974 + 2:leaf=1.07747853,cover=90.0174103 + +0:[spore-print-color=green] yes=2,no=1,gain=112.605011,cover=189.202194 +1:[gill-spacing=close] yes=4,no=3,gain=66.4029999,cover=177.771835 + 3:leaf=-1.26934469,cover=42.277401 + 4:leaf=0.152607277,cover=135.494431 + 2:leaf=2.92190909,cover=11.4303684 + +0:[odor=almond] yes=2,no=1,gain=52.5610275,cover=170.612762 +1:[odor=anise] yes=4,no=3,gain=67.3869553,cover=150.881165 + 3:leaf=0.431742132,cover=131.902222 + 4:leaf=-1.53846073,cover=18.9789505 +2:[gill-spacing=close] yes=6,no=5,gain=12.4420624,cover=19.731596 + 5:leaf=-3.02413678,cover=3.65769386 + 6:leaf=-1.02315068,cover=16.0739021 + +0:[odor=none] yes=2,no=1,gain=66.2389145,cover=142.360611 +1:[odor=anise] yes=4,no=3,gain=31.2294312,cover=72.7557373 + 3:leaf=0.777142286,cover=64.5309982 + 4:leaf=-1.19710124,cover=8.22473907 +2:[spore-print-color=green] yes=6,no=5,gain=12.1987419,cover=69.6048737 + 5:leaf=-0.912605286,cover=66.1211166 + 6:leaf=0.836115122,cover=3.48375821 + +0:[gill-size=broad] yes=2,no=1,gain=20.6531773,cover=79.4027634 +1:[spore-print-color=white] yes=4,no=3,gain=16.0703697,cover=34.9289207 + 3:leaf=-0.0180106498,cover=25.0319824 + 4:leaf=1.4361918,cover=9.89693928 +2:[odor=foul] yes=6,no=5,gain=22.1144333,cover=44.4738464 + 5:leaf=-0.908311546,cover=36.982872 + 6:leaf=0.890622675,cover=7.49097395 + +0:[odor=almond] yes=2,no=1,gain=11.7128553,cover=53.3251991 +1:[ring-type=pendant] yes=4,no=3,gain=12.546154,cover=44.299942 + 3:leaf=-0.515293062,cover=15.7899179 + 4:leaf=0.56883812,cover=28.5100231 + 2:leaf=-1.01502442,cover=9.02525806 + +0:[population=clustered] yes=2,no=1,gain=14.8892794,cover=45.9312019 +1:[odor=none] yes=4,no=3,gain=10.1308851,cover=43.0564575 + 3:leaf=0.217203051,cover=22.3283749 + 4:leaf=-0.734555721,cover=20.7280827 +2:[stalk-root=missing] yes=6,no=5,gain=19.3462334,cover=2.87474418 + 5:leaf=3.63442755,cover=1.34154534 + 6:leaf=-0.609474957,cover=1.53319895 +"); + } +} diff --git 
a/pgml-extension/pgml_rust/rust-xgboost/src/dmatrix.rs b/pgml-extension/pgml_rust/rust-xgboost/src/dmatrix.rs new file mode 100644 index 000000000..0488e0fca --- /dev/null +++ b/pgml-extension/pgml_rust/rust-xgboost/src/dmatrix.rs @@ -0,0 +1,486 @@
+use std::{slice, ffi, ptr, path::Path};
+use libc::{c_uint, c_float};
+use std::os::unix::ffi::OsStrExt;
+use std::convert::TryInto;
+
+use xgboost_sys;
+
+use super::{XGBResult, XGBError};
+
+static KEY_GROUP_PTR: &'static str = "group_ptr";
+static KEY_GROUP: &'static str = "group";
+static KEY_LABEL: &'static str = "label";
+static KEY_WEIGHT: &'static str = "weight";
+static KEY_BASE_MARGIN: &'static str = "base_margin";
+
+/// Data matrix used throughout XGBoost for training/predicting [`Booster`](struct.Booster.html) models.
+///
+/// It's used as a container for both features (i.e. a row for every instance), and an optional true label for that
+/// instance (as an `f32` value).
+///
+/// Can be created from files, or from dense or sparse
+/// ([CSR](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))
+/// or [CSC](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_(CSC_or_CCS))) matrices.
+///
+/// # Examples
+///
+/// ## Load from file
+///
+/// Load matrix from file in [LIBSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) or binary format.
+///
+/// ```should_panic
+/// use xgboost::DMatrix;
+///
+/// let dmat = DMatrix::load("somefile.txt").unwrap();
+/// ```
+///
+/// ## Create from dense array
+///
+/// ```
+/// use xgboost::DMatrix;
+///
+/// let data = &[1.0, 0.5, 0.2, 0.2,
+///              0.7, 1.0, 0.1, 0.1,
+///              0.2, 0.0, 0.0, 1.0];
+/// let num_rows = 3;
+/// let mut dmat = DMatrix::from_dense(data, num_rows).unwrap();
+/// assert_eq!(dmat.shape(), (3, 4));
+///
+/// // set true labels for each row
+/// dmat.set_labels(&[1.0, 0.0, 1.0]);
+/// ```
+///
+/// ## Create from sparse CSR matrix
+///
+/// Create from sparse representation of
+/// ```text
+/// [[1.0, 0.0, 2.0],
+///  [0.0, 0.0, 3.0],
+///  [4.0, 5.0, 6.0]]
+/// ```
+///
+/// ```
+/// use xgboost::DMatrix;
+///
+/// let indptr = &[0, 2, 3, 6];
+/// let indices = &[0, 2, 2, 0, 1, 2];
+/// let data = &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
+/// let dmat = DMatrix::from_csr(indptr, indices, data, None).unwrap();
+/// assert_eq!(dmat.shape(), (3, 3));
+/// ```
+pub struct DMatrix {
+    pub(super) handle: xgboost_sys::DMatrixHandle,
+    num_rows: usize,
+    num_cols: usize,
+}
+
+impl DMatrix {
+    /// Construct a new instance from a DMatrixHandle created by the XGBoost C API.
+    fn new(handle: xgboost_sys::DMatrixHandle) -> XGBResult<Self> {
+        // number of rows/cols are frequently read throughout applications, so more convenient to pull them out once
+        // when the matrix is created, instead of having to check errors each time XGDMatrixNum* is called
+        let mut out = 0;
+        xgb_call!(xgboost_sys::XGDMatrixNumRow(handle, &mut out))?;
+        let num_rows = out as usize;
+
+        let mut out = 0;
+        xgb_call!(xgboost_sys::XGDMatrixNumCol(handle, &mut out))?;
+        let num_cols = out as usize;
+
+        info!("Loaded DMatrix with shape: {}x{}", num_rows, num_cols);
+        Ok(DMatrix { handle, num_rows, num_cols })
+    }
+
+    /// Create a new `DMatrix` from a dense array in row-major order.
+    ///
+    /// E.g. the matrix
+    /// ```text
+    /// [[1.0, 2.0],
+    ///  [3.0, 4.0],
+    ///  [5.0, 6.0]]
+    /// ```
+    /// would be converted into a `DMatrix` with
+    /// ```
+    /// use xgboost::DMatrix;
+    ///
+    /// let data = &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
+    /// let num_rows = 3;
+    /// let dmat = DMatrix::from_dense(data, num_rows).unwrap();
+    /// ```
+    pub fn from_dense(data: &[f32], num_rows: usize) -> XGBResult<DMatrix> {
+        let mut handle = ptr::null_mut();
+        xgb_call!(xgboost_sys::XGDMatrixCreateFromMat(data.as_ptr(),
+                                                      num_rows as xgboost_sys::bst_ulong,
+                                                      (data.len() / num_rows) as xgboost_sys::bst_ulong,
+                                                      0.0, // TODO: can values be missing here?
+                                                      &mut handle))?;
+        Ok(DMatrix::new(handle)?)
+    }
+
+    /// Create a new `DMatrix` from a sparse
+    /// [CSR](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) matrix.
+    ///
+    /// Uses standard CSR representation where the column indices for row _i_ are stored in
+    /// `indices[indptr[i]:indptr[i+1]]` and their corresponding values are stored in
+    /// `data[indptr[i]:indptr[i+1]]`.
+    ///
+    /// If `num_cols` is set to None, the number of columns will be inferred from the given data.
+    pub fn from_csr(indptr: &[usize], indices: &[usize], data: &[f32], num_cols: Option<usize>) -> XGBResult<DMatrix> {
+        assert_eq!(indices.len(), data.len());
+        let mut handle = ptr::null_mut();
+        let indptr: Vec<u64> = indptr.iter().map(|x| *x as u64).collect();
+        let indices: Vec<u32> = indices.iter().map(|x| *x as u32).collect();
+        let num_cols = num_cols.unwrap_or(0); // infer from data if 0
+        xgb_call!(xgboost_sys::XGDMatrixCreateFromCSREx(indptr.as_ptr(),
+                                                        indices.as_ptr(),
+                                                        data.as_ptr(),
+                                                        indptr.len().try_into().unwrap(),
+                                                        data.len().try_into().unwrap(),
+                                                        num_cols.try_into().unwrap(),
+                                                        &mut handle))?;
+        Ok(DMatrix::new(handle)?)
+    }
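+
+    // Worked sketch added for illustration (not original library code): for the
+    // CSR matrix
+    //
+    //     [[1.0, 0.0, 2.0],
+    //      [0.0, 0.0, 3.0],
+    //      [4.0, 5.0, 6.0]]
+    //
+    // indptr = [0, 2, 3, 6], so row 1 owns indices[2..3] = [2] and
+    // data[2..3] = [3.0]: its single non-zero value, 3.0, sits in column 2.
+    //
+    //     let dmat = DMatrix::from_csr(&[0, 2, 3, 6],
+    //                                  &[0, 2, 2, 0, 1, 2],
+    //                                  &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+    //                                  None).unwrap();
+    //     assert_eq!(dmat.shape(), (3, 3));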
+
+    /// Create a new `DMatrix` from a sparse
+    /// [CSC](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_(CSC_or_CCS)) matrix.
+    ///
+    /// Uses standard CSC representation where the row indices for column _i_ are stored in
+    /// `indices[indptr[i]:indptr[i+1]]` and their corresponding values are stored in
+    /// `data[indptr[i]:indptr[i+1]]`.
+    ///
+    /// If `num_rows` is set to None, the number of rows will be inferred from the given data.
+    pub fn from_csc(indptr: &[usize], indices: &[usize], data: &[f32], num_rows: Option<usize>) -> XGBResult<DMatrix> {
+        assert_eq!(indices.len(), data.len());
+        let mut handle = ptr::null_mut();
+        let indptr: Vec<u64> = indptr.iter().map(|x| *x as u64).collect();
+        let indices: Vec<u32> = indices.iter().map(|x| *x as u32).collect();
+        let num_rows = num_rows.unwrap_or(0); // infer from data if 0
+        xgb_call!(xgboost_sys::XGDMatrixCreateFromCSCEx(indptr.as_ptr(),
+                                                        indices.as_ptr(),
+                                                        data.as_ptr(),
+                                                        indptr.len().try_into().unwrap(),
+                                                        data.len().try_into().unwrap(),
+                                                        num_rows.try_into().unwrap(),
+                                                        &mut handle))?;
+        Ok(DMatrix::new(handle)?)
+    }
+
+    /// Create a new `DMatrix` from the given file.
+    ///
+    /// Supports text files in [LIBSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) format, CSV,
+    /// and binary files written either by `save` or by another XGBoost library.
+    ///
+    /// For more details on accepted formats, see the
+    /// [XGBoost input format](https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html)
+    /// documentation.
+    ///
+    /// # LIBSVM format
+    ///
+    /// Specifies data in a sparse format as:
+    /// ```text
+    ///