diff --git a/examples/digits/run.sql b/examples/digits/run.sql
new file mode 100644
index 000000000..539c03053
--- /dev/null
+++ b/examples/digits/run.sql
@@ -0,0 +1,42 @@
+-- This example trains models on the sklearn digits dataset,
+-- which is a copy of the test set of the UCI ML hand-written digits datasets:
+-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
+--
+-- The final result after a few seconds of training is not terrible. Maybe not perfect
+-- enough for mission-critical applications, but it's telling how quickly "off the shelf"
+-- solutions can solve problems these days.
+SELECT pgml.load_dataset('digits');
+
+-- view the dataset
+SELECT * FROM pgml.digits;
+
+-- train a simple model to classify the data
+SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target');
+
+-- check out the predictions
+SELECT target, pgml.predict('Handwritten Digit Image Classifier', image) AS prediction
+FROM pgml.digits
+LIMIT 10;
+
+-- train some more models with different algorithms
+SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'svm');
+SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'random_forest');
+SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosting_trees');
+-- TODO SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'dense_neural_network');
+-- check out all that hard work
+SELECT * FROM pgml.trained_models;
+
+-- deploy the random_forest model for prediction use
+SELECT pgml.deploy('Handwritten Digit Image Classifier', 'random_forest');
+-- check out that throughput
+SELECT * FROM pgml.deployed_models;
+
+-- do some hyperparameter tuning
+-- TODO SELECT pgml.hypertune(100, 'Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosted_trees');
+-- deploy the "best" model for prediction use
+SELECT pgml.deploy('Handwritten Digit Image Classifier', 'best_fit');
+
+-- check out the improved predictions
+SELECT target, pgml.predict('Handwritten Digit Image Classifier', image) AS prediction
+FROM pgml.digits
+LIMIT 10;
diff --git a/pgml/pgml/__init__.py b/pgml/pgml/__init__.py
index 534003028..b3a53da87 100644
--- a/pgml/pgml/__init__.py
+++ b/pgml/pgml/__init__.py
@@ -1,2 +1,2 @@
 def version():
-    return "0.4.1"
+    return "0.4.2"
diff --git a/pgml/pgml/datasets.py b/pgml/pgml/datasets.py
new file mode 100644
index 000000000..d997254cf
--- /dev/null
+++ b/pgml/pgml/datasets.py
@@ -0,0 +1,21 @@
+import plpy
+from sklearn.datasets import load_digits as d
+
+from pgml.sql import q
+from pgml.exceptions import PgMLException
+
+def load(source: str):
+    if source == "digits":
+        load_digits()
+    else:
+        raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
+    return "OK"
+
+def load_digits():
+    dataset = d()
+    a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
+    a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
+    a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
+    for X, y in zip(dataset["data"], dataset["target"]):
+        X = ",".join("%i" % x for x in list(X))
+        plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")
diff --git a/pgml/pgml/model.py b/pgml/pgml/model.py
index 4c265a984..e8a721550 100644
--- a/pgml/pgml/model.py
+++ b/pgml/pgml/model.py
@@ -1,14 +1,23 @@
+from re import M
 import plpy
-from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.svm import SVR, SVC
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.metrics import mean_squared_error, r2_score, f1_score, precision_score, recall_score
 
 import pickle
+import json
 
 from pgml.exceptions import PgMLException
 from pgml.sql import q
 
+def flatten(S):
+    if S == []:
+        return S
+    if isinstance(S[0], list):
+        return flatten(S[0]) + flatten(S[1:])
+    return S[:1] + flatten(S[1:])
 
 class Project(object):
     """
@@ -124,6 +133,14 @@ def deployed_model(self):
             self._deployed_model = Model.find_deployed(self.id)
         return self._deployed_model
 
+    def deploy(self, algorithm_name):
+        model = None
+        if algorithm_name == "best_fit":
+            model = Model.find_by_project_and_best_fit(self)
+        else:
+            model = Model.find_by_project_id_and_algorithm_name(self.id, algorithm_name)
+        model.deploy()
+        return model
 
 class Snapshot(object):
     """
@@ -178,7 +195,7 @@ def create(
         plpy.execute(
             f"""
             CREATE TABLE pgml."snapshot_{snapshot.id}" AS
-            SELECT * FROM "{snapshot.relation_name}";
+            SELECT * FROM {snapshot.relation_name};
             """
         )
         snapshot.__dict__ = dict(
@@ -232,6 +249,7 @@ def data(self):
             for column in columns:
                 x_.append(row[column])
 
+            x_ = flatten(x_)  # TODO be smart about flattening X depending on algorithm
             X.append(x_)
             y.append(y_)
 
@@ -262,8 +280,7 @@ class Model(object):
         status (str): The current status of the model, e.g. 'new', 'training' or 'successful'
         created_at (Timestamp): when this model was created
         updated_at (Timestamp): when this model was last updated
-        mean_squared_error (float):
-        r2_score (float):
+        metrics (dict): key performance indicators for the model
         pickle (bytes): the serialized version of the model parameters
         algorithm: the in-memory version of the model parameters that can make predictions
     """
@@ -320,6 +337,63 @@ def find_deployed(cls, project_id: int):
         model.__init__()
         return model
 
+    @classmethod
+    def find_by_project_id_and_algorithm_name(cls, project_id: int, algorithm_name: str):
+        """
+        Args:
+            project_id (int): The project id
+            algorithm_name (str): The algorithm
+        Returns:
+            Model: most recently created model that fits the criteria
+        """
+        result = plpy.execute(
+            f"""
+            SELECT models.*
+            FROM pgml.models
+            WHERE algorithm_name = {q(algorithm_name)}
+            AND project_id = {q(project_id)}
+            ORDER by models.created_at DESC
+            LIMIT 1
+            """
+        )
+        if len(result) == 0:
+            return None
+
+        model = Model()
+        model.__dict__ = dict(result[0])
+        model.__init__()
+        return model
+
+    @classmethod
+    def find_by_project_and_best_fit(cls, project: Project):
+        """
+        Args:
+            project (Project): The project
+        Returns:
+            Model: the model with the best metrics for the project
+        """
+        if project.objective == "regression":
+            metric = "mean_squared_error"
+        elif project.objective == "classification":
+            metric = "f1"
+
+        result = plpy.execute(
+            f"""
+            SELECT models.*
+            FROM pgml.models
+            WHERE project_id = {q(project.id)}
+            ORDER by models.metrics->>{q(metric)} DESC
+            LIMIT 1
+            """
+        )
+        if len(result) == 0:
+            return None
+
+        model = Model()
+        model.__dict__ = dict(result[0])
+        model.__init__()
+        return model
+
     def __init__(self):
         self._algorithm = None
         self._project = None
@@ -342,8 +416,13 @@ def algorithm(self):
             else:
                 self._algorithm = {
                     "linear_regression": LinearRegression,
+                    "linear_classification": LogisticRegression,
+                    "svm_regression": SVR,
+                    "svm_classification": SVC,
                     "random_forest_regression": RandomForestRegressor,
                     "random_forest_classification": RandomForestClassifier,
+                    "gradient_boosting_trees_regression": GradientBoostingRegressor,
+                    "gradient_boosting_trees_classification": GradientBoostingClassifier,
                 }[self.algorithm_name + "_" + self.project.objective]()
 
         return self._algorithm
@@ -362,8 +441,14 @@ def fit(self, snapshot: Snapshot):
 
         # Test
         y_pred = self.algorithm.predict(X_test)
-        msq = mean_squared_error(y_test, y_pred)
-        r2 = r2_score(y_test, y_pred)
+        metrics = {}
+        if self.project.objective == "regression":
+            metrics["mean_squared_error"] = mean_squared_error(y_test, y_pred)
+            metrics["r2"] = r2_score(y_test, y_pred)
+        elif self.project.objective == "classification":
+            metrics["f1"] = f1_score(y_test, y_pred, average="weighted")
+            metrics["precision"] = precision_score(y_test, y_pred, average="weighted")
+            metrics["recall"] = recall_score(y_test, y_pred, average="weighted")
 
         # Save the model
         self.__dict__ = dict(
@@ -372,8 +457,7 @@
                 UPDATE pgml.models
                 SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}',
                     status = 'successful',
-                    mean_squared_error = {q(msq)},
-                    r2_score = {q(r2)}
+                    metrics = {q(json.dumps(metrics))}
                 WHERE id = {q(self.id)}
                 RETURNING *
             """
@@ -398,6 +482,7 @@ def predict(self, data: list):
         Returns:
             float or int: scores for regressions or ints for classifications
         """
+        # TODO: add metrics for tracking prediction volume/accuracy by model
        return self.algorithm.predict(data)
 
 
@@ -406,6 +491,7 @@ def train(
     objective: str,
     relation_name: str,
     y_column_name: str,
+    algorithm_name: str = "linear",
     test_size: float or int = 0.1,
     test_sampling: str = "random",
 ):
@@ -416,15 +502,14 @@ def train(
         objective (str): Defaults to "regression". Valid values are ["regression", "classification"].
         relation_name (str): the table or view that stores the training data
         y_column_name (str): the column in the training data that acts as the label
-        algorithm (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "random_forest"].
+        algorithm_name (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "svm", "random_forest", "gradient_boosting_trees"].
         test_size (float or int, optional): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If train_size is also None, it will be set to 0.25.
         test_sampling: (str, optional): How to sample to create the test data. Defaults to "random". Valid values are ["first", "last", "random"].
     """
-    if objective == "regression":
-        algorithms = ["linear", "random_forest"]
-    elif objective == "classification":
-        algorithms = ["random_forest"]
-    else:
+    if algorithm_name is None:
+        algorithm_name = "linear"
+
+    if objective not in ["regression", "classification"]:
         raise PgMLException(
             f"Unknown objective `{objective}`, available options are: regression, classification."
         )
@@ -440,23 +525,11 @@
         )
 
     snapshot = Snapshot.create(relation_name, y_column_name, test_size, test_sampling)
-    deployed = Model.find_deployed(project.id)
-
-    # Let's assume that the deployed model is better for now.
-    best_model = deployed
-    best_error = best_model.mean_squared_error if best_model else None
-
-    for algorithm_name in algorithms:
-        model = Model.create(project, snapshot, algorithm_name)
-        model.fit(snapshot)
+    model = Model.create(project, snapshot, algorithm_name)
+    model.fit(snapshot)
 
-        # Find the better model and deploy that.
-        if best_error is None or model.mean_squared_error < best_error:
-            best_error = model.mean_squared_error
-            best_model = model
-
-    if deployed and deployed.id == best_model.id:
-        return "rolled back"
-    else:
-        best_model.deploy()
+    if project.deployed_model is None:
+        model.deploy()
         return "deployed"
+    else:
+        return "not deployed"
diff --git a/sql/install.sql b/sql/install.sql
index e18fdc9aa..5eadea438 100644
--- a/sql/install.sql
+++ b/sql/install.sql
@@ -6,7 +6,7 @@ CREATE EXTENSION IF NOT EXISTS plpython3u;
 ---
 --- Create schema for models.
 ---
--- DROP SCHEMA pgml CASCADE;
+DROP SCHEMA pgml CASCADE;
 CREATE SCHEMA IF NOT EXISTS pgml;
 
 CREATE OR REPLACE FUNCTION pgml.auto_updated_at(tbl regclass)
@@ -70,8 +70,7 @@ CREATE TABLE IF NOT EXISTS pgml.models(
     status TEXT NOT NULL,
     created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT clock_timestamp(),
     updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT clock_timestamp(),
-    mean_squared_error DOUBLE PRECISION,
-    r2_score DOUBLE PRECISION,
+    metrics JSONB,
     pickle BYTEA,
     CONSTRAINT project_id_fk FOREIGN KEY(project_id) REFERENCES pgml.projects(id),
     CONSTRAINT snapshot_id_fk FOREIGN KEY(snapshot_id) REFERENCES pgml.snapshots(id)
@@ -101,27 +100,46 @@ AS $$
 $$ LANGUAGE plpython3u;
 
 ---
---- Regression
+--- Load data
 ---
-DROP FUNCTION IF EXISTS pgml.train(project_name TEXT, objective TEXT, relation_name TEXT, y_column_name TEXT);
-CREATE OR REPLACE FUNCTION pgml.train(project_name TEXT, objective TEXT, relation_name TEXT, y_column_name TEXT)
+CREATE OR REPLACE FUNCTION pgml.load_dataset(source TEXT)
+RETURNS TEXT
+AS $$
+    from pgml.datasets import load
+    return load(source)
+$$ LANGUAGE plpython3u;
+
+---
+--- Train
+---
+CREATE OR REPLACE FUNCTION pgml.train(project_name TEXT, objective TEXT, relation_name TEXT, y_column_name TEXT, algorithm TEXT DEFAULT NULL)
 RETURNS TABLE(project_name TEXT, objective TEXT, status TEXT)
 AS $$
     from pgml.model import train
-    status = train(project_name, objective, relation_name, y_column_name)
+    status = train(project_name, objective, relation_name, y_column_name, algorithm)
     return [(project_name, objective, status)]
 $$ LANGUAGE plpython3u;
 
+---
+--- Deploy
+---
+CREATE OR REPLACE FUNCTION pgml.deploy(project_name TEXT, algorithm_name TEXT)
+RETURNS TABLE(project_name TEXT, objective TEXT, algorithm_name TEXT)
+AS $$
+    from pgml.model import Project
+    model = Project.find_by_name(project_name).deploy(algorithm_name)
+    return [(model.project.name, model.project.objective, model.algorithm_name)]
+$$ LANGUAGE plpython3u;
+
 ---
 --- Predict
 ---
-CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, VARIADIC features DOUBLE PRECISION[])
+CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features NUMERIC[])
 RETURNS DOUBLE PRECISION
 AS $$
     from pgml.model import Project
-
     return Project.find_by_name(project_name).deployed_model.predict([features,])[0]
 $$ LANGUAGE plpython3u;
 
@@ -135,8 +153,7 @@ SELECT
     d.created_at AS deployed_at,
     p.objective,
     m.algorithm_name,
-    m.mean_squared_error,
-    m.r2_score,
+    m.metrics,
     s.relation_name,
     s.y_column_name,
     s.test_sampling,
@@ -147,3 +164,50 @@ INNER JOIN pgml.deployments d ON d.project_id = p.id
     AND d.model_id = m.id
 INNER JOIN pgml.snapshots s ON s.id = m.snapshot_id
 ORDER BY d.created_at DESC;
+
+
+---
+--- List details of trained models.
+---
+DROP VIEW IF EXISTS pgml.trained_models;
+CREATE VIEW pgml.trained_models AS
+SELECT
+    p.name,
+    p.objective,
+    m.algorithm_name,
+    m.metrics,
+    m.created_at,
+    s.test_sampling,
+    s.test_size,
+    d.model_id IS NOT NULL AS deployed
+FROM pgml.projects p
+INNER JOIN pgml.models m ON p.id = m.project_id
+INNER JOIN pgml.snapshots s ON s.id = m.snapshot_id
+LEFT JOIN (
+    SELECT DISTINCT ON(project_id)
+        project_id, model_id, created_at
+    FROM pgml.deployments
+    ORDER BY project_id, created_at desc
+) d ON d.model_id = m.id
+ORDER BY m.created_at DESC;
+
+---
+--- List details of deployed models.
+---
+DROP VIEW IF EXISTS pgml.deployed_models;
+CREATE VIEW pgml.deployed_models AS
+SELECT
+    p.name,
+    p.objective,
+    m.algorithm_name,
+    m.metrics,
+    d.created_at as deployed_at
+FROM pgml.projects p
+INNER JOIN (
+    SELECT DISTINCT ON(project_id)
+        project_id, model_id, created_at
+    FROM pgml.deployments
+    ORDER BY project_id, created_at desc
+) d ON d.project_id = p.id
+INNER JOIN pgml.models m ON m.id = d.model_id
+ORDER BY p.name ASC;
diff --git a/sql/test.sql b/sql/test.sql
index 7522f83ec..822c0955b 100644
--- a/sql/test.sql
+++ b/sql/test.sql
@@ -9,11 +9,11 @@ SELECT pgml.version();
 \timing
 
 SELECT pgml.train('Red Wine Scores', 'regression', 'wine_quality_red', 'quality');
-SELECT pgml.predict('Red Wine Scores', 7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4);
-SELECT pgml.predict('Red Wine Scores', 6.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4);
-SELECT pgml.predict('Red Wine Scores', 5.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4);
-SELECT pgml.predict('Red Wine Scores', 3.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4);
+SELECT pgml.predict('Red Wine Scores', '{7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4}');
+SELECT pgml.predict('Red Wine Scores', '{6.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4}');
+SELECT pgml.predict('Red Wine Scores', '{5.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4}');
+SELECT pgml.predict('Red Wine Scores', '{3.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4}');
 
-SELECT pgml.train('Red Wine Categories', 'classification', 'wine_quality_red', 'quality');
-SELECT pgml.predict('Red Wine Categories', 7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4);
+SELECT pgml.train('Red Wine Categories', 'classification', 'wine_quality_red', 'quality', 'svm');
+SELECT pgml.predict('Red Wine Categories', '{7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4}');
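For anyone trying out the patch, here is a small illustrative query that is not part of the diff above. It sketches one way to read individual values back out of the new JSONB metrics column through the pgml.trained_models view; the key names follow what Model.fit() writes ('f1', 'precision', 'recall' for classification; 'mean_squared_error', 'r2' for regression), and the column aliases are hypothetical.

-- Illustrative only, not part of the patch: inspect per-model metrics
-- written by Model.fit() into the JSONB metrics column.
SELECT
    name,
    algorithm_name,
    metrics->>'f1' AS f1_weighted,          -- classification projects
    metrics->>'mean_squared_error' AS mse,  -- regression projects (NULL for classification)
    deployed
FROM pgml.trained_models
ORDER BY created_at DESC;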