Commit 995e0b2

data loading in rust
1 parent 69986e4 commit 995e0b2

4 files changed (+163, -161 lines)

pgml-extension/src/api.rs (16 additions, 9 deletions)

@@ -485,18 +485,23 @@ fn snapshot(
 #[pg_extern]
 fn load_dataset(
     source: &str,
+    subset: default!(Option<String>, "NULL"),
     limit: default!(Option<i64>, "NULL"),
+    kwargs: default!(JsonB, "'{}'"),
 ) -> TableIterator<'static, (name!(table_name, String), name!(rows, i64))> {
     // cast limit since pgx doesn't support usize
     let limit: Option<usize> = limit.map(|limit| limit.try_into().unwrap());
     let (name, rows) = match source {
-        "breast_cancer" => crate::orm::dataset::load_breast_cancer(limit),
-        "diabetes" => crate::orm::dataset::load_diabetes(limit),
-        "digits" => crate::orm::dataset::load_digits(limit),
-        "iris" => crate::orm::dataset::load_iris(limit),
-        "linnerud" => crate::orm::dataset::load_linnerud(limit),
-        "wine" => crate::orm::dataset::load_wine(limit),
-        _ => error!("Unknown source: `{source}`"),
+        "breast_cancer" => dataset::load_breast_cancer(limit),
+        "diabetes" => dataset::load_diabetes(limit),
+        "digits" => dataset::load_digits(limit),
+        "iris" => dataset::load_iris(limit),
+        "linnerud" => dataset::load_linnerud(limit),
+        "wine" => dataset::load_wine(limit),
+        _ => {
+            let rows = crate::bindings::transformers::load_dataset(source, subset, limit, &kwargs.0);
+            (source.into(), rows as i64)
+        },
     };

     TableIterator::new(vec![(name, rows)].into_iter())
@@ -537,7 +542,7 @@ fn tune(
     task: default!(Option<Task>, "NULL"),
     relation_name: default!(Option<&str>, "NULL"),
     y_column_name: default!(Option<&str>, "NULL"),
-    algorithm: default!(Algorithm, "transformers"),
+    algorithm: default!(Option<&str>, "NULL"),
     hyperparams: default!(JsonB, "'{}'"),
     search: default!(Option<Search>, "NULL"),
     search_params: default!(JsonB, "'{}'"),
@@ -608,14 +613,16 @@ fn tune(
         }
     };

+    let model_name = algorithm;
+
     // # Default repeatable random state when possible
     // let algorithm = Model.algorithm_from_name_and_task(algorithm, task);
     // if "random_state" in algorithm().get_params() and "random_state" not in hyperparams:
     //     hyperparams["random_state"] = 0
     let model = Model::create(
         &project,
         &mut snapshot,
-        algorithm,
+        Algorithm::transformers,
         hyperparams,
         search,
         search_params,
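
The key change above is the catch-all arm in load_dataset: any source that is not one of the built-in toy datasets is forwarded, together with the new subset and kwargs arguments, to crate::bindings::transformers::load_dataset (the Hugging Face loader), and the returned row count is surfaced to SQL. Below is a minimal, hypothetical pgx sketch of the same shape: optional SQL arguments via default! and a one-row TableIterator result. The function and argument names are illustrative only and not part of this commit; it assumes the pgx prelude used elsewhere in this extension.

    use pgx::prelude::*;

    // Hypothetical example mirroring the signature pattern used by load_dataset:
    // optional SQL arguments default to NULL, and the result is a single
    // (table_name, rows) tuple exposed to SQL as a one-row table.
    #[pg_extern]
    fn load_example(
        source: &str,
        limit: default!(Option<i64>, "NULL"),
    ) -> TableIterator<'static, (name!(table_name, String), name!(rows, i64))> {
        // A real implementation would dispatch on `source`, as load_dataset does above.
        let rows = limit.unwrap_or(0);
        TableIterator::new(vec![(source.to_string(), rows)].into_iter())
    }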

pgml-extension/src/bindings/sklearn.rs (50 additions, 84 deletions)

@@ -9,13 +9,25 @@
 /// defined in `src/bindings/sklearn.py`.
 use std::collections::HashMap;

+use once_cell::sync::Lazy;
 use pyo3::prelude::*;
 use pyo3::types::PyTuple;

 use crate::bindings::Bindings;

 use crate::orm::*;

+static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(||
+    Python::with_gil(|py| -> Py<PyModule> {
+        let src = include_str!(concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/src/bindings/sklearn.py"
+        ));
+
+        PyModule::from_code(py, src, "", "").unwrap().into()
+    })
+);
+
 pub fn linear_regression(dataset: &Dataset, hyperparams: &Hyperparams) -> Box<dyn Bindings> {
     fit(dataset, hyperparams, "linear_regression")
 }
@@ -290,17 +302,11 @@ fn fit(
     hyperparams: &Hyperparams,
     algorithm_task: &'static str,
 ) -> Box<dyn Bindings> {
-    let module = include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/bindings/sklearn.py"
-    ));
-
     let hyperparams = serde_json::to_string(hyperparams).unwrap();

     let (estimator, predict, predict_proba) =
         Python::with_gil(|py| -> (Py<PyAny>, Py<PyAny>, Py<PyAny>) {
-            let module = PyModule::from_code(py, module, "", "").unwrap();
-            let estimator: Py<PyAny> = module.getattr("estimator").unwrap().into();
+            let estimator: Py<PyAny> = PY_MODULE.getattr(py, "estimator").unwrap().into();

             let train: Py<PyAny> = estimator
                 .call1(
@@ -321,20 +327,20 @@ fn fit(
                 .call1(py, PyTuple::new(py, &[&dataset.x_train, &dataset.y_train]))
                 .unwrap();

-            let predict: Py<PyAny> = module
-                .getattr("predictor")
+            let predict: Py<PyAny> = PY_MODULE
+                .getattr(py, "predictor")
                 .unwrap()
-                .call1(PyTuple::new(py, &[&estimator]))
+                .call1(py, PyTuple::new(py, &[&estimator]))
                 .unwrap()
-                .extract()
+                .extract(py)
                 .unwrap();

-            let predict_proba: Py<PyAny> = module
-                .getattr("predictor_proba")
+            let predict_proba: Py<PyAny> = PY_MODULE
+                .getattr(py, "predictor_proba")
                 .unwrap()
-                .call1(PyTuple::new(py, &[&estimator]))
+                .call1(py, PyTuple::new(py, &[&estimator]))
                 .unwrap()
-                .extract()
+                .extract(py)
                 .unwrap();

             (estimator, predict, predict_proba)
@@ -389,17 +395,11 @@ impl Bindings for Estimator {

     /// Serialize self to bytes
     fn to_bytes(&self) -> Vec<u8> {
-        let module = include_str!(concat!(
-            env!("CARGO_MANIFEST_DIR"),
-            "/src/bindings/sklearn.py"
-        ));
-
         Python::with_gil(|py| -> Vec<u8> {
-            let module = PyModule::from_code(py, module, "", "").unwrap();
-            let save = module.getattr("save").unwrap();
-            save.call1(PyTuple::new(py, &[&self.estimator]))
+            let save = PY_MODULE.getattr(py, "save").unwrap();
+            save.call1(py, PyTuple::new(py, &[&self.estimator]))
                 .unwrap()
-                .extract()
+                .extract(py)
                 .unwrap()
         })
     }
@@ -409,34 +409,28 @@ impl Bindings for Estimator {
     where
         Self: Sized,
     {
-        let module = include_str!(concat!(
-            env!("CARGO_MANIFEST_DIR"),
-            "/src/bindings/sklearn.py"
-        ));
-
         Python::with_gil(|py| -> Box<dyn Bindings> {
-            let module = PyModule::from_code(py, module, "", "").unwrap();
-            let load = module.getattr("load").unwrap();
+            let load = PY_MODULE.getattr(py, "load").unwrap();
             let estimator: Py<PyAny> = load
-                .call1(PyTuple::new(py, &[bytes]))
+                .call1(py,PyTuple::new(py, &[bytes]))
                 .unwrap()
-                .extract()
+                .extract(py)
                 .unwrap();

-            let predict: Py<PyAny> = module
-                .getattr("predictor")
+            let predict: Py<PyAny> = PY_MODULE
+                .getattr(py,"predictor")
                 .unwrap()
-                .call1(PyTuple::new(py, &[&estimator]))
+                .call1(py,PyTuple::new(py, &[&estimator]))
                 .unwrap()
-                .extract()
+                .extract(py)
                 .unwrap();

-            let predict_proba: Py<PyAny> = module
-                .getattr("predictor_proba")
+            let predict_proba: Py<PyAny> = PY_MODULE
+                .getattr(py, "predictor_proba")
                 .unwrap()
-                .call1(PyTuple::new(py, &[&estimator]))
+                .call1(py,PyTuple::new(py, &[&estimator]))
                 .unwrap()
-                .extract()
+                .extract(py)
                 .unwrap();

             Box::new(Estimator {
@@ -449,18 +443,12 @@ impl Bindings for Estimator {
 }

 fn sklearn_metric(name: &str, ground_truth: &[f32], y_hat: &[f32]) -> f32 {
-    let module = include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/bindings/sklearn.py"
-    ));
-
     Python::with_gil(|py| -> f32 {
-        let module = PyModule::from_code(py, module, "", "").unwrap();
-        let calculate_metric = module.getattr("calculate_metric").unwrap();
+        let calculate_metric = PY_MODULE.getattr(py, "calculate_metric").unwrap();
         let wrapper: Py<PyAny> = calculate_metric
-            .call1(PyTuple::new(py, &[name]))
+            .call1(py,PyTuple::new(py, &[name]))
             .unwrap()
-            .extract()
+            .extract(py)
             .unwrap();

         let score: f32 = wrapper
@@ -490,18 +478,12 @@ pub fn recall(ground_truth: &[f32], y_hat: &[f32]) -> f32 {
 }

 pub fn confusion_matrix(ground_truth: &[f32], y_hat: &[f32]) -> Vec<Vec<f32>> {
-    let module = include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/bindings/sklearn.py"
-    ));
-
     Python::with_gil(|py| -> Vec<Vec<f32>> {
-        let module = PyModule::from_code(py, module, "", "").unwrap();
-        let calculate_metric = module.getattr("calculate_metric").unwrap();
+        let calculate_metric = PY_MODULE.getattr(py, "calculate_metric").unwrap();
         let wrapper: Py<PyAny> = calculate_metric
-            .call1(PyTuple::new(py, &["confusion_matrix"]))
+            .call1(py,PyTuple::new(py, &["confusion_matrix"]))
             .unwrap()
-            .extract()
+            .extract(py)
             .unwrap();

         let matrix: Vec<Vec<f32>> = wrapper
@@ -515,18 +497,12 @@ pub fn confusion_matrix(ground_truth: &[f32], y_hat: &[f32]) -> Vec<Vec<f32>> {
 }

 pub fn regression_metrics(ground_truth: &[f32], y_hat: &[f32]) -> HashMap<String, f32> {
-    let module = include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/bindings/sklearn.py"
-    ));
-
     Python::with_gil(|py| -> HashMap<String, f32> {
-        let module = PyModule::from_code(py, module, "", "").unwrap();
-        let calculate_metric = module.getattr("regression_metrics").unwrap();
+        let calculate_metric = PY_MODULE.getattr(py,"regression_metrics").unwrap();
         let scores: HashMap<String, f32> = calculate_metric
-            .call1(PyTuple::new(py, &[ground_truth, y_hat]))
+            .call1(py,PyTuple::new(py, &[ground_truth, y_hat]))
             .unwrap()
-            .extract()
+            .extract(py)
             .unwrap();

         scores
@@ -538,18 +514,12 @@ pub fn classification_metrics(
     y_hat: &[f32],
     num_classes: usize,
 ) -> HashMap<String, f32> {
-    let module = include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/src/bindings/sklearn.py"
-    ));
-
     let mut scores = Python::with_gil(|py| -> HashMap<String, f32> {
-        let module = PyModule::from_code(py, module, "", "").unwrap();
-        let calculate_metric = module.getattr("classification_metrics").unwrap();
+        let calculate_metric = PY_MODULE.getattr(py, "classification_metrics").unwrap();
         let scores: HashMap<String, f32> = calculate_metric
-            .call1(PyTuple::new(py, &[ground_truth, y_hat]))
+            .call1(py,PyTuple::new(py, &[ground_truth, y_hat]))
             .unwrap()
-            .extract()
+            .extract(py)
             .unwrap();

         scores
@@ -564,12 +534,8 @@ pub fn classification_metrics(
 }

 pub fn package_version(name: &str) -> String {
-    let mut version = String::new();
-
-    Python::with_gil(|py| {
+    Python::with_gil(|py| -> String {
         let package = py.import(name).unwrap();
-        version = package.getattr("__version__").unwrap().extract().unwrap();
-    });
-
-    version
+        package.getattr("__version__").unwrap().extract().unwrap()
+    })
 }
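
The refactor above replaces the per-call include_str! + PyModule::from_code dance with a single process-wide Lazy<Py<PyModule>>: the embedded Python source is compiled once, and every call site then fetches attributes from the cached handle under the GIL, which is why the calls change from module.getattr("x") to PY_MODULE.getattr(py, "x"), call1(py, ...), and extract(py). A minimal standalone sketch of the same pattern, assuming the once_cell and pyo3 crates already used in this file and an embeddable Python interpreter (pyo3's "auto-initialize" feature); the helper module and the double function are made up for illustration:

    use once_cell::sync::Lazy;
    use pyo3::prelude::*;
    use pyo3::types::PyTuple;

    // Compile the embedded Python source once; later calls reuse the cached
    // module handle instead of re-running PyModule::from_code.
    static HELPERS: Lazy<Py<PyModule>> = Lazy::new(|| {
        Python::with_gil(|py| -> Py<PyModule> {
            let src = "def double(x):\n    return x * 2\n";
            PyModule::from_code(py, src, "helpers.py", "helpers")
                .unwrap()
                .into()
        })
    });

    fn double(x: i64) -> i64 {
        Python::with_gil(|py| {
            HELPERS
                .getattr(py, "double")             // Py<T> methods take the GIL token...
                .unwrap()
                .call1(py, PyTuple::new(py, &[x])) // ...for calls...
                .unwrap()
                .extract(py)                       // ...and for extraction back into Rust.
                .unwrap()
        })
    }

    fn main() {
        assert_eq!(double(21), 42);
    }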

pgml-extension/src/bindings/transformers.py (15 additions, 57 deletions)

@@ -14,75 +14,33 @@ def transform(task, args, inputs):

     return json.dumps(pipe(inputs, **args))

-def load_dataset(name, subset, limit: None, **kwargs):
+def load_dataset(name, subset, limit: None, kwargs: "{}"):
+    kwargs = json.loads(kwargs)
+
     if limit:
         dataset = datasets.load_dataset(name, subset, split=f"train[:{limit}]", **kwargs)
     else:
         dataset = datasets.load_dataset(name, subset, **kwargs)

+    dict = None
     if isinstance(dataset, datasets.Dataset):
-        sample = dataset[0]
+        sample = dataset.to_dict()
     elif isinstance(dataset, datasets.DatasetDict):
-        sample = dataset["train"][0]
+        dict = {}
+        # Merge train/test splits, we'll re-split back in PostgresML.
+        for name, split in dataset.items():
+            for field, values in split.to_dict().items():
+                if field in dict:
+                    dict[field] += values
+                else:
+                    dict[field] = values
     else:
         raise PgMLException(f"Unhandled dataset type: {type(dataset)}")

-    columns = OrderedDict()
-    for key, value in sample.items():
-        column = c(key)
-        columns[column] = _PYTHON_TO_PG_MAP[type(value)]
-
-    table_name = f"pgml.{c(name)}"
-    plpy.execute(f"DROP TABLE IF EXISTS {table_name}")
-    plpy.execute(f"""CREATE TABLE {table_name} ({", ".join([f"{name} {type}" for name, type in columns.items()])})""")
-
-    if isinstance(dataset, datasets.Dataset):
-        load_dataset_rows(dataset, table_name)
-    elif isinstance(dataset, datasets.DatasetDict):
-        for name, rows in dataset.items():
-            if name == "unsupervised":
-                # postgresml doesn't provide unsupervised learning methods
-                continue
-            load_dataset_rows(rows, table_name)
-
-
-def load_dataset_rows(rows, table_name):
-    for row in rows:
-        plpy.execute(
-            f"""INSERT INTO {table_name} ({", ".join([c(v) for v in row.keys()])})
-            VALUES ({", ".join([q(v) for v in row.values()])})"""
-        )
-
-
-def transform(task, args, inputs):
-    cache = args.pop("cache", True)
-
-    # construct the cache key from task
-    key = task
-    if type(key) == dict:
-        key = tuple(sorted(key.items()))
-
-    if cache and key in _pipeline_cache:
-        pipe = _pipeline_cache.get(key)
-    else:
-        with timer("Initializing pipeline"):
-            if type(task) == str:
-                pipe = transformers.pipeline(task)
-            else:
-                pipe = transformers.pipeline(**task)
-        if cache:
-            _pipeline_cache[key] = pipe
-
-    if pipe.task == "question-answering":
-        inputs = [json.loads(input) for input in inputs]
-
-    with timer("inference"):
-        result = pipe(inputs, **args)
-
-    return result
+    return json.dumps(dict)


-class Model(BaseModel):
+class Model:
     @property
     def algorithm(self):
         if self._algorithm is None:
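
With this change the Python side no longer creates tables or runs INSERTs through plpy; load_dataset now returns the whole dataset as one column-oriented JSON object (train/test splits merged), and table creation moves to the Rust caller referenced in api.rs. That caller is not part of this diff, so the following is only a hypothetical serde_json sketch of how a column-oriented payload like the one produced above could be decoded on the Rust side; the function name and payload are illustrative:

    use std::collections::HashMap;

    use serde_json::Value;

    // Decode {"text": ["a", "b"], "label": [0, 1]} into per-column vectors.
    // Illustration only; the real consumer lives in the transformers bindings,
    // which are not shown in this commit.
    fn decode_columns(payload: &str) -> HashMap<String, Vec<Value>> {
        serde_json::from_str(payload).expect("expected a column-oriented JSON object")
    }

    fn main() {
        let payload = r#"{"text": ["a", "b"], "label": [0, 1]}"#;
        let columns = decode_columns(payload);
        assert_eq!(columns["label"].len(), 2); // two rows after merging splits
    }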
