
# add PCA as first decomposition method #1441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions pgml-cms/docs/api/sql-extension/pgml.decompose.md
@@ -0,0 +1,50 @@
---
description: Decompose an input vector into its principal components
---

# pgml.decompose()


`pgml.decompose()` projects an input vector onto the principal components learned by a decomposition model previously trained with `pgml.train()`, returning the reduced-dimension vector.

## API

```sql
pgml.decompose(
    project_name TEXT, -- project name
    vector REAL[]      -- features to decompose
)
```

### Parameters

| Parameter | Example | Description |
|----------------|---------------------------------|----------------------------------------------------------|
| `project_name` | `'My First PostgresML Project'` | The project name used to train models in `pgml.train()`. |
| `vector` | `ARRAY[0.1, 0.45, 1.0]` | The feature vector that needs decomposition. |

## Example

```sql
SELECT pgml.decompose('My PCA', ARRAY[0.1, 2.0, 5.0]);
```

!!! example

```sql
SELECT *,
    pgml.decompose(
        'Buy it Again',
        ARRAY[
            users.location_id,
            EXTRACT(EPOCH FROM NOW() - users.created_at),
            users.total_purchases_in_dollars
        ]::REAL[]
    ) AS buying_score
FROM users
WHERE tenant_id = 5
ORDER BY buying_score
LIMIT 25;
```

!!!
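
`pgml.decompose()` requires a decomposition project that has already been trained and deployed. A minimal sketch of the prerequisite training call, matching the first example above; `my_features` is a hypothetical unlabeled relation holding the feature vectors:

```sql
SELECT * FROM pgml.train(
    'My PCA',
    'decomposition',
    'my_features',
    hyperparams => '{"n_components": 2}'
);
```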
6 changes: 3 additions & 3 deletions pgml-cms/docs/api/sql-extension/pgml.train/clustering.md
@@ -16,8 +16,8 @@ SELECT image FROM pgml.digits;
-- view the dataset
SELECT left(image::text, 40) || ',...}' FROM pgml.digit_vectors LIMIT 10;

--- train a simple model to classify the data
-SELECT * FROM pgml.train('Handwritten Digit Clusters', 'cluster', 'pgml.digit_vectors', hyperparams => '{"n_clusters": 10}');
+-- train a simple model to cluster the data
+SELECT * FROM pgml.train('Handwritten Digit Clusters', 'clustering', 'pgml.digit_vectors', hyperparams => '{"n_clusters": 10}');

-- check out the predictions
SELECT target, pgml.predict('Handwritten Digit Clusters', image) AS prediction
@@ -27,7 +27,7 @@ LIMIT 10.

## Algorithms

-All clustering algorithms implemented by PostgresML are online versions. You may use the [pgml.predict](../../../api/sql-extension/pgml.predict/ "mention")function to cluster novel datapoints after the clustering model has been trained.
+All clustering algorithms implemented by PostgresML are online versions. You may use the [pgml.predict](../../../api/sql-extension/pgml.predict/ "mention") function to cluster novel data points after the clustering model has been trained.

| Algorithm | Reference |
| ---------------------- | ----------------------------------------------------------------------------------------------------------------- |
42 changes: 42 additions & 0 deletions pgml-cms/docs/api/sql-extension/pgml.train/decomposition.md
@@ -0,0 +1,42 @@
# Decomposition

Models can be trained using `pgml.train` on unlabeled data to identify important features within the data. To decompose a dataset into its principal components, we can train on a table or a view. Since decomposition is an unsupervised algorithm, we don't need a column that represents a label as one of the inputs to `pgml.train`.
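
For instance, a minimal training call needs only a project name, the `decomposition` task, and a relation, since no label column is needed. A sketch, where `my_unlabeled_table` is a hypothetical relation of feature vectors:

```sql
SELECT * FROM pgml.train(
    'My Decomposition Project',
    'decomposition',
    'my_unlabeled_table'
);
```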

## Example

This example trains models on the sklearn digits dataset -- which is a copy of the test set of the [UCI ML hand-written digits datasets](https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits). This demonstrates using a table with a single array feature column for principal component analysis. You could do something similar with a vector column.

```sql
SELECT pgml.load_dataset('digits');

-- create an unlabeled table of the images for unsupervised learning
CREATE VIEW pgml.digit_vectors AS
SELECT image FROM pgml.digits;

-- view the dataset
SELECT left(image::text, 40) || ',...}' FROM pgml.digit_vectors LIMIT 10;

-- train a simple model to decompose the data
SELECT * FROM pgml.train('Handwritten Digit Components', 'decomposition', 'pgml.digit_vectors', hyperparams => '{"n_components": 3}');

-- check out the components
SELECT target, pgml.decompose('Handwritten Digit Components', image) AS pca
FROM pgml.digits
LIMIT 10;
```

Note that the input vectors have been reduced from 64 dimensions to 3 components, which together explain nearly half of the variance across all samples.
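
To check the exact figure, the model's stored metrics can be queried directly. A sketch, assuming the standard `pgml.models` catalog and the `cumulative_explained_variance` metric key this task records:

```sql
SELECT id, metrics->>'cumulative_explained_variance' AS explained_variance
FROM pgml.models
ORDER BY id DESC
LIMIT 1;
```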

## Algorithms

All decomposition algorithms implemented by PostgresML are online versions. You may use the [pgml.decompose](../../../api/sql-extension/pgml.decompose "mention") function to decompose novel data points after the model has been trained; see the sketch after the table below.

| Algorithm | Reference |
|---------------------------|---------------------------------------------------------------------------------------------------------------------|
| `pca` | [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) |
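
For instance, projecting rows through the `Handwritten Digit Components` model trained in the example above:

```sql
SELECT pgml.decompose('Handwritten Digit Components', image) AS components
FROM pgml.digits
LIMIT 5;
```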

### Examples

```sql
SELECT * FROM pgml.train('Handwritten Digit Components', algorithm => 'pca', hyperparams => '{"n_components": 10}');
```
6 changes: 4 additions & 2 deletions pgml-dashboard/src/models.rs
@@ -55,10 +55,11 @@ impl Project {
match self.task.as_ref().unwrap().as_str() {
"classification" | "text_classification" | "question_answering" => Ok("f1"),
"regression" => Ok("r2"),
"clustering" => Ok("silhouette"),
"decomposition" => Ok("cumulative_explained_variance"),
"summarization" => Ok("rouge_ngram_f1"),
"translation" => Ok("bleu"),
"text_generation" | "text2text" => Ok("perplexity"),
"cluster" => Ok("silhouette"),
task => Err(anyhow::anyhow!("Unhandled task: {}", task)),
}
}
@@ -67,10 +68,11 @@
match self.task.as_ref().unwrap().as_str() {
"classification" | "text_classification" | "question_answering" => Ok("F<sup>1</sup>"),
"regression" => Ok("R<sup>2</sup>"),
"clustering" => Ok("silhouette"),
"decomposition" => Ok("Cumulative Explained Variance"),
"summarization" => Ok("Rouge Ngram F<sup>1</sup>"),
"translation" => Ok("Bleu"),
"text_generation" | "text2text" => Ok("Perplexity"),
"cluster" => Ok("silhouette"),
task => Err(anyhow::anyhow!("Unhandled task: {}", task)),
}
}
File renamed without changes.

2 changes: 1 addition & 1 deletion pgml-extension/Cargo.lock

2 changes: 1 addition & 1 deletion pgml-extension/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "pgml"
version = "2.8.3"
version = "2.8.4"
edition = "2021"

[lib]
@@ -20,7 +20,7 @@ SELECT image FROM pgml.digits;
SELECT left(image::text, 40) || ',...}' FROM pgml.digit_vectors LIMIT 10;

-- train a simple model to classify the data
-SELECT * FROM pgml.train('Handwritten Digit Clusters', 'cluster', 'pgml.digit_vectors', hyperparams => '{"n_clusters": 10}');
+SELECT * FROM pgml.train('Handwritten Digit Clusters', 'clustering', 'pgml.digit_vectors', hyperparams => '{"n_clusters": 10}');

-- check out the predictions
SELECT target, pgml.predict('Handwritten Digit Clusters', image) AS prediction
60 changes: 60 additions & 0 deletions pgml-extension/examples/decomposition.sql
@@ -0,0 +1,60 @@
-- This example reduces the dimensionality of images in the sklearn digits dataset
-- which is a copy of the test set of the UCI ML hand-written digits datasets
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
--
-- This demonstrates using a table with a single array feature column
-- for decomposition to reduce dimensionality.
--
-- Exit on error (psql)
-- \set ON_ERROR_STOP true
\timing on

SELECT pgml.load_dataset('digits');

-- view the dataset
SELECT left(image::text, 40) || ',...}', target FROM pgml.digits LIMIT 10;

-- create a view of just the vectors for decomposition, without any labels
CREATE VIEW digit_vectors AS
SELECT image FROM pgml.digits;

SELECT * FROM pgml.train('Handwritten Digits Reduction', 'decomposition', 'digit_vectors');

-- check out the decomposed vectors
SELECT target, pgml.decompose('Handwritten Digits Reduction', image) AS pca
FROM pgml.digits
LIMIT 10;

--
-- After a project has been trained, omitted parameters will be reused from previous training runs
-- In these examples we'll reuse the training data snapshots from the initial call.
--

-- We can reduce the image vectors from 64 dimensions to 3 components
SELECT * FROM pgml.train('Handwritten Digits Reduction', hyperparams => '{"n_components": 3}');

-- check out the reduced vectors
SELECT target, pgml.decompose('Handwritten Digits Reduction', image) AS pca
FROM pgml.digits
LIMIT 10;

-- check out all that hard work
SELECT trained_models.* FROM pgml.trained_models
JOIN pgml.models on models.id = trained_models.id
ORDER BY models.metrics->>'cumulative_explained_variance' DESC LIMIT 5;

-- deploy the PCA model for prediction use
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'most_recent', 'pca');
-- check out that throughput
SELECT * FROM pgml.deployed_models ORDER BY deployed_at DESC LIMIT 5;

-- deploy the "best" model for prediction use
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'best_score');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'most_recent');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'rollback');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'best_score', 'pca');

-- check out the improved predictions
SELECT target, pgml.predict('Handwritten Digits Reduction', image) AS prediction
FROM pgml.digits
LIMIT 10;
5 changes: 2 additions & 3 deletions pgml-extension/examples/image_classification.sql
@@ -5,9 +5,8 @@
-- This demonstrates using a table with a single array feature column
-- for classification.
--
--- The final result after a few seconds of training is not terrible. Maybe not perfect
--- enough for mission critical applications, but it's telling how quickly "off the shelf"
--- solutions can solve problems these days.
+-- Some algorithms converge on this trivial dataset in under a second, demonstrating the
+-- speed with which modern machines can "learn" from example data.

-- Exit on error (psql)
-- \set ON_ERROR_STOP true
2 changes: 1 addition & 1 deletion pgml-extension/examples/regression.sql
@@ -1,4 +1,4 @@
--- This example trains models on the sklean diabetes dataset
+-- This example trains models on the sklearn diabetes dataset
-- Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
-- For more information see:
-- Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004)
13 changes: 13 additions & 0 deletions pgml-extension/sql/pgml--2.8.3--2.8.4.sql
@@ -0,0 +1,13 @@
ALTER TYPE pgml.task RENAME VALUE 'cluster' TO 'clustering';
ALTER TYPE pgml.task ADD VALUE IF NOT EXISTS 'decomposition';

ALTER TYPE pgml.algorithm ADD VALUE IF NOT EXISTS 'pca';

-- pgml::api::decompose
CREATE FUNCTION pgml."decompose"(
    "project_name" TEXT, /* alloc::string::String */
    "vector" FLOAT4[]    /* Vec<f32> */
) RETURNS FLOAT4[] /* Vec<f32> */
IMMUTABLE STRICT PARALLEL SAFE
LANGUAGE c /* Rust */
AS 'MODULE_PATHNAME', 'decompose_wrapper';
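
A quick smoke test after applying the migration; a sketch that assumes a decomposition project named 'My PCA' has already been trained and deployed:

```sql
SELECT pgml.decompose('My PCA', ARRAY[0.1, 2.0, 5.0]::FLOAT4[]);
```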
11 changes: 10 additions & 1 deletion pgml-extension/src/api.rs
@@ -225,8 +225,10 @@ fn train_joint(
};

// fix up default algorithms for clustering and decomposition
-let algorithm = if algorithm == Algorithm::linear && project.task == Task::cluster {
+let algorithm = if algorithm == Algorithm::linear && project.task == Task::clustering {
Algorithm::kmeans
+} else if algorithm == Algorithm::linear && project.task == Task::decomposition {
+Algorithm::pca
} else {
algorithm
};
@@ -482,6 +484,13 @@ fn predict_batch(project_name: &str, features: Vec<f32>) -> SetOfIterator<'stati
))
}

#[pg_extern(immutable, parallel_safe, strict, name = "decompose")]
fn decompose(project_name: &str, vector: Vec<f32>) -> Vec<f32> {
let model_id = Project::get_deployed_model_id(project_name);
let model = unwrap_or_error!(Model::find_cached(model_id));
unwrap_or_error!(model.decompose(&vector))
}

#[pg_extern(immutable, parallel_safe, strict, name = "predict")]
fn predict_row(project_name: &str, row: pgrx::datum::AnyElement) -> f32 {
predict_model_row(Project::get_deployed_model_id(project_name), row)
16 changes: 14 additions & 2 deletions pgml-extension/src/bindings/mod.rs
@@ -78,12 +78,24 @@ pub mod xgboost;

pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Result<Box<dyn Bindings>>;

use std::any::Any;

/// Lets a `dyn Bindings` trait object be downcast to its concrete type; the
/// sklearn bindings use this to reach the underlying estimator when computing
/// decomposition metrics.
pub trait AToAny: 'static {
    fn as_any(&self) -> &dyn Any;
}

impl<T: 'static> AToAny for T {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

/// The Bindings trait that has to be implemented by all algorithm
/// providers we use in PostgresML. We don't rely on Serde serialization,
/// since scikit-learn estimators were originally serialized in pure Python as
-/// pickled objects, and neither xgboost or linfa estimators completely
+/// pickled objects, and neither xgboost nor linfa estimators completely
/// implement serde.
-pub trait Bindings: Send + Sync + Debug {
+pub trait Bindings: Send + Sync + Debug + AToAny {
/// Predict a set of datapoints.
fn predict(&self, features: &[f32], num_features: usize, num_classes: usize) -> Result<Vec<f32>>;

28 changes: 23 additions & 5 deletions pgml-extension/src/bindings/sklearn/mod.rs
@@ -14,7 +14,11 @@ use anyhow::Result;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

-use crate::{bindings::Bindings, create_pymodule, orm::*};
+use crate::{
+    bindings::{Bindings, TracebackError},
+    create_pymodule,
+    orm::*,
+};

create_pymodule!("/src/bindings/sklearn/sklearn.py");

@@ -35,8 +39,8 @@ wrap_fit!(random_forest_regression, "random_forest_regression");
wrap_fit!(xgboost_regression, "xgboost_regression");
wrap_fit!(xgboost_random_forest_regression, "xgboost_random_forest_regression");
wrap_fit!(
-    orthogonal_matching_persuit_regression,
-    "orthogonal_matching_persuit_regression"
+    orthogonal_matching_pursuit_regression,
+    "orthogonal_matching_pursuit_regression"
);
wrap_fit!(bayesian_ridge_regression, "bayesian_ridge_regression");
wrap_fit!(
@@ -109,6 +113,8 @@ wrap_fit!(spectral, "spectral_clustering");
wrap_fit!(spectral_bi, "spectral_biclustering");
wrap_fit!(spectral_co, "spectral_coclustering");

wrap_fit!(pca, "pca_decomposition");

fn fit(dataset: &Dataset, hyperparams: &Hyperparams, algorithm_task: &'static str) -> Result<Box<dyn Bindings>> {
let hyperparams = serde_json::to_string(hyperparams).unwrap();

@@ -293,9 +299,9 @@ pub fn classification_metrics(ground_truth: &[f32], y_hat: &[f32], num_classes:
Ok(scores)
}

-pub fn cluster_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> Result<HashMap<String, f32>> {
+pub fn clustering_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> Result<HashMap<String, f32>> {
Python::with_gil(|py| {
-let calculate_metric = get_module!(PY_MODULE).getattr(py, "cluster_metrics")?;
+let calculate_metric = get_module!(PY_MODULE).getattr(py, "clustering_metrics")?;

let scores: HashMap<String, f32> = calculate_metric
.call1(py, (num_features, PyTuple::new(py, [inputs, labels])))?
@@ -304,3 +310,15 @@ pub fn cluster_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> R
Ok(scores)
})
}

pub fn decomposition_metrics(bindings: &Box<dyn Bindings>) -> Result<HashMap<String, f32>> {
    Python::with_gil(|py| match bindings.as_any().downcast_ref::<Estimator>() {
        Some(estimator) => {
            let calculate_metric = get_module!(PY_MODULE).getattr(py, "decomposition_metrics")?;
            let metrics = calculate_metric.call1(py, PyTuple::new(py, [&estimator.estimator]));
            let metrics = metrics.format_traceback(py)?.extract(py).format_traceback(py)?;
            Ok(metrics)
        }
        None => error!("Can't compute decomposition metrics for bindings other than sklearn"),
    })
}